Skip to main content

query/promql/
planner.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::collections::{BTreeSet, HashSet, VecDeque};
16use std::sync::Arc;
17use std::time::UNIX_EPOCH;
18
19use arrow::datatypes::IntervalDayTime;
20use async_recursion::async_recursion;
21use catalog::table_source::DfTableSourceProvider;
22use common_error::ext::ErrorExt;
23use common_error::status_code::StatusCode;
24use common_function::function::FunctionContext;
25use common_query::prelude::greptime_value;
26use datafusion::common::DFSchemaRef;
27use datafusion::datasource::DefaultTableSource;
28use datafusion::functions_aggregate::average::avg_udaf;
29use datafusion::functions_aggregate::count::count_udaf;
30use datafusion::functions_aggregate::expr_fn::first_value;
31use datafusion::functions_aggregate::min_max::{max_udaf, min_udaf};
32use datafusion::functions_aggregate::stddev::stddev_pop_udaf;
33use datafusion::functions_aggregate::sum::sum_udaf;
34use datafusion::functions_aggregate::variance::var_pop_udaf;
35use datafusion::functions_window::row_number::RowNumber;
36use datafusion::logical_expr::expr::{Alias, ScalarFunction, WindowFunction};
37use datafusion::logical_expr::expr_rewriter::normalize_cols;
38use datafusion::logical_expr::{
39    BinaryExpr, Cast, Extension, LogicalPlan, LogicalPlanBuilder, Operator,
40    ScalarUDF as ScalarUdfDef, WindowFrame, WindowFunctionDefinition,
41};
42use datafusion::prelude as df_prelude;
43use datafusion::prelude::{Column, Expr as DfExpr, JoinType};
44use datafusion::scalar::ScalarValue;
45use datafusion::sql::TableReference;
46use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
47use datafusion_common::{DFSchema, NullEquality};
48use datafusion_expr::expr::WindowFunctionParams;
49use datafusion_expr::utils::conjunction;
50use datafusion_expr::{
51    ExprSchemable, Literal, Projection, SortExpr, TableScan, TableSource, col, lit,
52};
53use datatypes::arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit};
54use datatypes::data_type::ConcreteDataType;
55use itertools::Itertools;
56use once_cell::sync::Lazy;
57use promql::extension_plan::{
58    Absent, EmptyMetric, HistogramFold, InstantManipulate, Millisecond, RangeManipulate,
59    ScalarCalculate, SeriesDivide, SeriesNormalize, UnionDistinctOn, build_special_time_expr,
60};
61use promql::functions::{
62    AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, DoubleExponentialSmoothing,
63    IDelta, Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime,
64    QuantileOverTime, Rate, Resets, Round, StddevOverTime, StdvarOverTime, SumOverTime,
65    quantile_udaf,
66};
67use promql_parser::label::{METRIC_NAME, MatchOp, Matcher, Matchers};
68use promql_parser::parser::token::TokenType;
69use promql_parser::parser::value::ValueType;
70use promql_parser::parser::{
71    AggregateExpr, BinModifier, BinaryExpr as PromBinaryExpr, Call, EvalStmt, Expr as PromExpr,
72    Function, FunctionArgs as PromFunctionArgs, LabelModifier, MatrixSelector, NumberLiteral,
73    Offset, ParenExpr, StringLiteral, SubqueryExpr, UnaryExpr, VectorMatchCardinality,
74    VectorSelector, token,
75};
76use regex::{self, Regex};
77use snafu::{OptionExt, ResultExt, ensure};
78use store_api::metric_engine_consts::{
79    DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME, LOGICAL_TABLE_METADATA_KEY,
80    METRIC_ENGINE_NAME, is_metric_engine_internal_column,
81};
82use table::table::adapter::DfTableProviderAdapter;
83
84use crate::parser::{
85    ALIAS_NODE_NAME, ANALYZE_NODE_NAME, ANALYZE_VERBOSE_NODE_NAME, AliasExpr, EXPLAIN_NODE_NAME,
86    EXPLAIN_VERBOSE_NODE_NAME,
87};
88use crate::promql::error::{
89    CatalogSnafu, ColumnNotFoundSnafu, CombineTableColumnMismatchSnafu, DataFusionPlanningSnafu,
90    ExpectRangeSelectorSnafu, FunctionInvalidArgumentSnafu, InvalidDestinationLabelNameSnafu,
91    InvalidRegularExpressionSnafu, InvalidTimeRangeSnafu, MultiFieldsNotSupportedSnafu,
92    MultipleMetricMatchersSnafu, MultipleVectorSnafu, NoMetricMatcherSnafu, PromqlPlanNodeSnafu,
93    Result, SameLabelSetSnafu, TableNameNotFoundSnafu, TimeIndexNotFoundSnafu,
94    UnexpectedPlanExprSnafu, UnexpectedTokenSnafu, UnknownTableSnafu, UnsupportedExprSnafu,
95    UnsupportedMatcherOpSnafu, UnsupportedVectorMatchSnafu, ValueNotFoundSnafu,
96    ZeroRangeSelectorSnafu,
97};
98use crate::query_engine::QueryEngineState;
99
/// Name of the `time()` function in PromQL.
const SPECIAL_TIME_FUNCTION: &str = "time";
/// Name of the `scalar()` function in PromQL.
const SCALAR_FUNCTION: &str = "scalar";
/// Name of the `absent()` function in PromQL.
const SPECIAL_ABSENT_FUNCTION: &str = "absent";
/// Name of the `histogram_quantile()` function in PromQL.
const SPECIAL_HISTOGRAM_QUANTILE: &str = "histogram_quantile";
/// Name of the `vector()` function in PromQL.
const SPECIAL_VECTOR_FUNCTION: &str = "vector";
/// `le` column for conventional histogram.
const LE_COLUMN_NAME: &str = "le";

/// Static regex for validating label names according to the Prometheus specification.
/// Label names must match the regex: `[a-zA-Z_][a-zA-Z0-9_]*`.
///
/// The regex is compiled lazily on first access; the pattern is a fixed literal, so the
/// `unwrap` only guards against a programming error in the pattern itself.
static LABEL_NAME_REGEX: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap());

/// Default time index column name, used e.g. when planning an expression made of literals only.
const DEFAULT_TIME_INDEX_COLUMN: &str = "time";

/// Default value column name for an empty metric.
const DEFAULT_FIELD_COLUMN: &str = "value";

/// Special modifier to project field columns under multi-field mode.
const FIELD_COLUMN_MATCHER: &str = "__field__";

/// Special modifier for cross schema query.
const SCHEMA_COLUMN_MATCHER: &str = "__schema__";
/// Special modifier for cross schema query.
// NOTE(review): presumably an alternative spelling of `__schema__` — confirm at the use site.
const DB_COLUMN_MATCHER: &str = "__database__";

/// Threshold for scatter scan mode.
const MAX_SCATTER_POINTS: i64 = 400;

/// One hour, in milliseconds.
const INTERVAL_1H: i64 = 60 * 60 * 1000;
135
/// Mutable state carried by [`PromPlanner`] while a PromQL statement is translated.
///
/// It bundles the evaluation parameters taken from the statement with bookkeeping about the
/// plan that is currently being built (table, columns, matchers, range).
#[derive(Default, Debug, Clone)]
struct PromPlannerContext {
    // query parameters
    /// Evaluation start, in milliseconds relative to the Unix epoch.
    start: Millisecond,
    /// Evaluation end, in milliseconds relative to the Unix epoch.
    end: Millisecond,
    /// Evaluation step, in milliseconds.
    interval: Millisecond,
    /// Lookback delta, in milliseconds.
    lookback_delta: Millisecond,

    // planner states
    /// Table name (or a join alias such as `lhs`/`rhs`) the current plan refers to.
    table_name: Option<String>,
    /// Name of the time index column of the current plan.
    time_index_column: Option<String>,
    /// Names of the value (field) columns of the current plan.
    field_columns: Vec<String>,
    /// Names of the tag (label) columns of the current plan.
    tag_columns: Vec<String>,
    /// Use metric engine internal series identifier column (`__tsid`) as series key.
    ///
    /// This is enabled only when the underlying scan can provide `__tsid` (`UInt64`). The planner
    /// uses it internally (e.g. as the series key for [`SeriesDivide`]) and strips it from the
    /// final output.
    use_tsid: bool,
    /// The matcher for field columns `__field__`.
    field_column_matcher: Option<Vec<Matcher>>,
    /// The matcher for selectors (normal matchers).
    selector_matcher: Vec<Matcher>,
    /// Schema name of the current plan, if any.
    schema_name: Option<String>,
    /// The range in millisecond of range selector. None if there is no range selector.
    range: Option<Millisecond>,
}
163
164impl PromPlannerContext {
165    fn from_eval_stmt(stmt: &EvalStmt) -> Self {
166        Self {
167            start: stmt.start.duration_since(UNIX_EPOCH).unwrap().as_millis() as _,
168            end: stmt.end.duration_since(UNIX_EPOCH).unwrap().as_millis() as _,
169            interval: stmt.interval.as_millis() as _,
170            lookback_delta: stmt.lookback_delta.as_millis() as _,
171            ..Default::default()
172        }
173    }
174
175    /// Reset all planner states
176    fn reset(&mut self) {
177        self.table_name = None;
178        self.time_index_column = None;
179        self.field_columns = vec![];
180        self.tag_columns = vec![];
181        self.use_tsid = false;
182        self.field_column_matcher = None;
183        self.selector_matcher.clear();
184        self.schema_name = None;
185        self.range = None;
186    }
187
188    /// Reset table name and schema to empty
189    fn reset_table_name_and_schema(&mut self) {
190        self.table_name = Some(String::new());
191        self.schema_name = None;
192        self.use_tsid = false;
193    }
194
195    /// Check if `le` is present in tag columns
196    fn has_le_tag(&self) -> bool {
197        self.tag_columns.iter().any(|c| c.eq(&LE_COLUMN_NAME))
198    }
199}
200
/// Planner that translates a PromQL statement into a DataFusion [`LogicalPlan`].
///
/// Use [`PromPlanner::stmt_to_plan`] as the entry point.
pub struct PromPlanner {
    // NOTE(review): presumably used to resolve the tables referenced by selectors — the usage
    // is outside this part of the file.
    table_provider: DfTableSourceProvider,
    /// Evaluation parameters and per-plan bookkeeping, updated while planning.
    ctx: PromPlannerContext,
}
205
206impl PromPlanner {
207    pub async fn stmt_to_plan(
208        table_provider: DfTableSourceProvider,
209        stmt: &EvalStmt,
210        query_engine_state: &QueryEngineState,
211    ) -> Result<LogicalPlan> {
212        let mut planner = Self {
213            table_provider,
214            ctx: PromPlannerContext::from_eval_stmt(stmt),
215        };
216
217        let plan = planner
218            .prom_expr_to_plan(&stmt.expr, query_engine_state)
219            .await?;
220
221        // Never leak internal series identifier to output.
222        planner.strip_tsid_column(plan)
223    }
224
225    pub async fn prom_expr_to_plan(
226        &mut self,
227        prom_expr: &PromExpr,
228        query_engine_state: &QueryEngineState,
229    ) -> Result<LogicalPlan> {
230        self.prom_expr_to_plan_inner(prom_expr, false, query_engine_state)
231            .await
232    }
233
234    /**
235    Converts a PromQL expression to a logical plan.
236
237    NOTE:
238        The `timestamp_fn` indicates whether the PromQL `timestamp()` function is being evaluated in the current context.
239        If `true`, the planner generates a logical plan that projects the timestamp (time index) column
240        as the value column for each input row, implementing the PromQL `timestamp()` function semantics.
241        If `false`, the planner generates the standard logical plan for the given PromQL expression.
242    */
243    #[async_recursion]
244    async fn prom_expr_to_plan_inner(
245        &mut self,
246        prom_expr: &PromExpr,
247        timestamp_fn: bool,
248        query_engine_state: &QueryEngineState,
249    ) -> Result<LogicalPlan> {
250        let res = match prom_expr {
251            PromExpr::Aggregate(expr) => {
252                self.prom_aggr_expr_to_plan(query_engine_state, expr)
253                    .await?
254            }
255            PromExpr::Unary(expr) => {
256                self.prom_unary_expr_to_plan(query_engine_state, expr)
257                    .await?
258            }
259            PromExpr::Binary(expr) => {
260                self.prom_binary_expr_to_plan(query_engine_state, expr)
261                    .await?
262            }
263            PromExpr::Paren(ParenExpr { expr }) => {
264                self.prom_expr_to_plan_inner(expr, timestamp_fn, query_engine_state)
265                    .await?
266            }
267            PromExpr::Subquery(expr) => {
268                self.prom_subquery_expr_to_plan(query_engine_state, expr)
269                    .await?
270            }
271            PromExpr::NumberLiteral(lit) => self.prom_number_lit_to_plan(lit)?,
272            PromExpr::StringLiteral(lit) => self.prom_string_lit_to_plan(lit)?,
273            PromExpr::VectorSelector(selector) => {
274                self.prom_vector_selector_to_plan(selector, timestamp_fn)
275                    .await?
276            }
277            PromExpr::MatrixSelector(selector) => {
278                self.prom_matrix_selector_to_plan(selector).await?
279            }
280            PromExpr::Call(expr) => {
281                self.prom_call_expr_to_plan(query_engine_state, expr)
282                    .await?
283            }
284            PromExpr::Extension(expr) => {
285                self.prom_ext_expr_to_plan(query_engine_state, expr).await?
286            }
287        };
288
289        Ok(res)
290    }
291
292    async fn prom_subquery_expr_to_plan(
293        &mut self,
294        query_engine_state: &QueryEngineState,
295        subquery_expr: &SubqueryExpr,
296    ) -> Result<LogicalPlan> {
297        let SubqueryExpr {
298            expr, range, step, ..
299        } = subquery_expr;
300
301        let current_interval = self.ctx.interval;
302        if let Some(step) = step {
303            self.ctx.interval = step.as_millis() as _;
304        }
305        let current_start = self.ctx.start;
306        self.ctx.start -= range.as_millis() as i64 - self.ctx.interval;
307        let input = self.prom_expr_to_plan(expr, query_engine_state).await?;
308        self.ctx.interval = current_interval;
309        self.ctx.start = current_start;
310
311        ensure!(!range.is_zero(), ZeroRangeSelectorSnafu);
312        let range_ms = range.as_millis() as _;
313        self.ctx.range = Some(range_ms);
314
315        let time_index_column =
316            self.ctx
317                .time_index_column
318                .clone()
319                .with_context(|| TimeIndexNotFoundSnafu {
320                    table: self.ctx.table_name.clone().unwrap_or_default(),
321                })?;
322
323        // `RangeManipulate` assumes each input batch holds exactly one series
324        // (it takes tag column values from row 0 and applies them to every
325        // output row). The inner expression may emit batches that mix series,
326        // so sort by series key + time index and split into per-series batches
327        // with a `SeriesDivide` first.
328        let input_schema = input.schema();
329        let input_has_tsid = input_schema.fields().iter().any(|field| {
330            field.name() == DATA_SCHEMA_TSID_COLUMN_NAME
331                && field.data_type() == &ArrowDataType::UInt64
332        });
333        let (series_key_columns, mut sort_exprs) = if input_has_tsid {
334            (
335                vec![DATA_SCHEMA_TSID_COLUMN_NAME.to_string()],
336                vec![
337                    DfExpr::Column(Column::from_name(DATA_SCHEMA_TSID_COLUMN_NAME))
338                        .sort(true, true),
339                ],
340            )
341        } else {
342            // Only use tag columns that survive in the inner plan's schema —
343            // `ctx.tag_columns` can drift from the actual output.
344            let key_columns: Vec<String> = self
345                .ctx
346                .tag_columns
347                .iter()
348                .filter(|name| input_schema.has_column_with_unqualified_name(name))
349                .cloned()
350                .collect();
351            let sort = key_columns
352                .iter()
353                .map(|name| DfExpr::Column(Column::from_name(name)).sort(true, true))
354                .collect::<Vec<_>>();
355            (key_columns, sort)
356        };
357        sort_exprs.push(DfExpr::Column(Column::from_name(&time_index_column)).sort(true, true));
358
359        let sort_plan = LogicalPlanBuilder::from(input)
360            .sort(sort_exprs)
361            .context(DataFusionPlanningSnafu)?
362            .build()
363            .context(DataFusionPlanningSnafu)?;
364        let divide_plan = LogicalPlan::Extension(Extension {
365            node: Arc::new(SeriesDivide::new(
366                series_key_columns,
367                time_index_column.clone(),
368                sort_plan,
369            )),
370        });
371
372        let manipulate = RangeManipulate::new(
373            self.ctx.start,
374            self.ctx.end,
375            self.ctx.interval,
376            range_ms,
377            time_index_column,
378            self.ctx.field_columns.clone(),
379            divide_plan,
380        )
381        .context(DataFusionPlanningSnafu)?;
382
383        Ok(LogicalPlan::Extension(Extension {
384            node: Arc::new(manipulate),
385        }))
386    }
387
    /// Plans a PromQL aggregation expression (e.g. `sum by (...) (expr)`).
    ///
    /// The inner expression is planned first. Tags referenced by the `by`/`without` modifier
    /// that are missing from the inner plan's schema are made available again before the group
    /// keys are computed. `topk`/`bottomk` are delegated to
    /// [`Self::prom_topk_bottomk_to_plan`]; every other operator becomes an aggregate node
    /// followed by a sort on the group expressions (`count_values` additionally gets a
    /// projection that exposes the counted value under the requested label).
    ///
    /// As a side effect `self.ctx.use_tsid` is updated to tell whether the resulting plan still
    /// carries the `__tsid` column.
    async fn prom_aggr_expr_to_plan(
        &mut self,
        query_engine_state: &QueryEngineState,
        aggr_expr: &AggregateExpr,
    ) -> Result<LogicalPlan> {
        let AggregateExpr {
            op,
            expr,
            modifier,
            param,
        } = aggr_expr;

        let mut input = self.prom_expr_to_plan(expr, query_engine_state).await?;
        // `__tsid` only counts when it is present with the expected `UInt64` type.
        let input_has_tsid = input.schema().fields().iter().any(|field| {
            field.name() == DATA_SCHEMA_TSID_COLUMN_NAME
                && field.data_type() == &ArrowDataType::UInt64
        });

        // `__tsid` based scan projection may prune tag columns. Ensure tags referenced in
        // aggregation modifiers (`by`/`without`) are available before planning group keys.
        let required_group_tags = match modifier {
            None => BTreeSet::new(),
            // `by (...)`: the listed labels, minus metric engine internal columns.
            Some(LabelModifier::Include(labels)) => labels
                .labels
                .iter()
                .filter(|label| !is_metric_engine_internal_column(label.as_str()))
                .cloned()
                .collect(),
            // `without (...)`: every row-key tag of the input except the listed labels.
            Some(LabelModifier::Exclude(labels)) => {
                let mut all_tags = self.collect_row_key_tag_columns_from_plan(&input)?;
                for label in &labels.labels {
                    let _ = all_tags.remove(label);
                }
                all_tags
            }
        };

        // Only rewrite the input when at least one required tag is actually missing.
        if !required_group_tags.is_empty()
            && required_group_tags
                .iter()
                .any(|tag| Self::find_case_sensitive_column(input.schema(), tag.as_str()).is_none())
        {
            input = self.ensure_tag_columns_available(input, &required_group_tags)?;
            self.refresh_tag_columns_from_schema(input.schema());
        }

        match (*op).id() {
            token::T_TOPK | token::T_BOTTOMK => {
                self.prom_topk_bottomk_to_plan(aggr_expr, input).await
            }
            _ => {
                // When `__tsid` is available, tag columns may have been pruned from the input plan.
                // For `keep_tsid` decision we should compare against the full row-key label set,
                // otherwise we may incorrectly treat label-reducing aggregates as preserving labels.
                //
                // This snapshot is taken before `agg_modifier_to_col` runs; it is compared with
                // `self.ctx.tag_columns` afterwards to detect whether the label set changed.
                let input_tag_columns = if input_has_tsid {
                    self.collect_row_key_tag_columns_from_plan(&input)?
                        .into_iter()
                        .collect::<Vec<_>>()
                } else {
                    self.ctx.tag_columns.clone()
                };
                // calculate columns to group by
                // Need to append time index column into group by columns
                let mut group_exprs = self.agg_modifier_to_col(input.schema(), modifier, true)?;
                // convert op and value columns to aggregate exprs
                let (mut aggr_exprs, prev_field_exprs) =
                    self.create_aggregate_exprs(*op, param, &input)?;

                // Keep `__tsid` only when the aggregation leaves the label set unchanged
                // (compared as sets, so ordering does not matter) and the operator is not
                // `count_values`.
                let keep_tsid = op.id() != token::T_COUNT_VALUES
                    && input_has_tsid
                    && input_tag_columns.iter().collect::<HashSet<_>>()
                        == self.ctx.tag_columns.iter().collect::<HashSet<_>>();

                if keep_tsid {
                    // Carry `__tsid` through the aggregate via `first_value`, keeping its name.
                    aggr_exprs.push(
                        first_value(
                            DfExpr::Column(Column::from_name(DATA_SCHEMA_TSID_COLUMN_NAME)),
                            vec![],
                        )
                        .alias(DATA_SCHEMA_TSID_COLUMN_NAME),
                    );
                }
                self.ctx.use_tsid = keep_tsid;

                // create plan
                let builder = LogicalPlanBuilder::from(input);
                let builder = if op.id() == token::T_COUNT_VALUES {
                    let label = Self::get_param_value_as_str(*op, param)?;
                    // `count_values` must be grouped by fields,
                    // and project the fields to the new label.
                    group_exprs.extend(prev_field_exprs.clone());
                    let project_fields = self
                        .create_field_column_exprs()?
                        .into_iter()
                        .chain(self.create_tag_column_exprs()?)
                        .chain(Some(self.create_time_index_column_expr()?))
                        .chain(prev_field_exprs.into_iter().map(|expr| expr.alias(label)));

                    builder
                        .aggregate(group_exprs.clone(), aggr_exprs)
                        .context(DataFusionPlanningSnafu)?
                        .project(project_fields)
                        .context(DataFusionPlanningSnafu)?
                } else {
                    builder
                        .aggregate(group_exprs.clone(), aggr_exprs)
                        .context(DataFusionPlanningSnafu)?
                };

                // Sort the output on the group expressions: ascending, nulls last.
                let sort_expr = group_exprs.into_iter().map(|expr| expr.sort(true, false));

                builder
                    .sort(sort_expr)
                    .context(DataFusionPlanningSnafu)?
                    .build()
                    .context(DataFusionPlanningSnafu)
            }
        }
    }
507
508    /// Create logical plan for PromQL topk and bottomk expr.
509    async fn prom_topk_bottomk_to_plan(
510        &mut self,
511        aggr_expr: &AggregateExpr,
512        input: LogicalPlan,
513    ) -> Result<LogicalPlan> {
514        let AggregateExpr {
515            op,
516            param,
517            modifier,
518            ..
519        } = aggr_expr;
520
521        let input_has_tsid = input.schema().fields().iter().any(|field| {
522            field.name() == DATA_SCHEMA_TSID_COLUMN_NAME
523                && field.data_type() == &ArrowDataType::UInt64
524        });
525        self.ctx.use_tsid = input_has_tsid;
526
527        let group_exprs = self.agg_modifier_to_col(input.schema(), modifier, false)?;
528
529        let val = Self::get_param_as_literal_expr(param, Some(*op), Some(ArrowDataType::Float64))?;
530
531        // convert op and value columns to window exprs.
532        let window_exprs = self.create_window_exprs(*op, group_exprs.clone(), &input)?;
533
534        let rank_columns: Vec<_> = window_exprs
535            .iter()
536            .map(|expr| expr.schema_name().to_string())
537            .collect();
538
539        // Create ranks filter with `Operator::Or`.
540        // Safety: at least one rank column
541        let filter: DfExpr = rank_columns
542            .iter()
543            .fold(None, |expr, rank| {
544                let predicate = DfExpr::BinaryExpr(BinaryExpr {
545                    left: Box::new(col(rank)),
546                    op: Operator::LtEq,
547                    right: Box::new(val.clone()),
548                });
549
550                match expr {
551                    None => Some(predicate),
552                    Some(expr) => Some(DfExpr::BinaryExpr(BinaryExpr {
553                        left: Box::new(expr),
554                        op: Operator::Or,
555                        right: Box::new(predicate),
556                    })),
557                }
558            })
559            .unwrap();
560
561        let rank_columns: Vec<_> = rank_columns.into_iter().map(col).collect();
562
563        let mut new_group_exprs = group_exprs.clone();
564        // Order by ranks
565        new_group_exprs.extend(rank_columns);
566
567        let group_sort_expr = new_group_exprs
568            .into_iter()
569            .map(|expr| expr.sort(true, false));
570
571        let project_fields = self
572            .create_field_column_exprs()?
573            .into_iter()
574            .chain(self.create_tag_column_exprs()?)
575            .chain(
576                self.ctx
577                    .use_tsid
578                    .then_some(DfExpr::Column(Column::from_name(
579                        DATA_SCHEMA_TSID_COLUMN_NAME,
580                    ))),
581            )
582            .chain(Some(self.create_time_index_column_expr()?));
583
584        LogicalPlanBuilder::from(input)
585            .window(window_exprs)
586            .context(DataFusionPlanningSnafu)?
587            .filter(filter)
588            .context(DataFusionPlanningSnafu)?
589            .sort(group_sort_expr)
590            .context(DataFusionPlanningSnafu)?
591            .project(project_fields)
592            .context(DataFusionPlanningSnafu)?
593            .build()
594            .context(DataFusionPlanningSnafu)
595    }
596
597    async fn prom_unary_expr_to_plan(
598        &mut self,
599        query_engine_state: &QueryEngineState,
600        unary_expr: &UnaryExpr,
601    ) -> Result<LogicalPlan> {
602        let UnaryExpr { expr } = unary_expr;
603        // Unary Expr in PromQL implys the `-` operator
604        let input = self.prom_expr_to_plan(expr, query_engine_state).await?;
605        self.projection_for_each_field_column(input, |col| {
606            Ok(DfExpr::Negative(Box::new(DfExpr::Column(col.into()))))
607        })
608    }
609
610    async fn prom_binary_expr_to_plan(
611        &mut self,
612        query_engine_state: &QueryEngineState,
613        binary_expr: &PromBinaryExpr,
614    ) -> Result<LogicalPlan> {
615        let PromBinaryExpr {
616            lhs,
617            rhs,
618            op,
619            modifier,
620        } = binary_expr;
621
622        // if set to true, comparison operator will return 0/1 (for true/false) instead of
623        // filter on the result column
624        let should_return_bool = if let Some(m) = modifier {
625            m.return_bool
626        } else {
627            false
628        };
629        let is_comparison_op = Self::is_token_a_comparison_op(*op);
630
631        // we should build a filter plan here if the op is comparison op and need not
632        // to return 0/1. Otherwise, we should build a projection plan
633        match (
634            Self::try_build_literal_expr(lhs),
635            Self::try_build_literal_expr(rhs),
636        ) {
637            (Some(lhs), Some(rhs)) => {
638                self.ctx.time_index_column = Some(DEFAULT_TIME_INDEX_COLUMN.to_string());
639                self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
640                self.ctx.reset_table_name_and_schema();
641                let field_expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
642                let mut field_expr = field_expr_builder(lhs, rhs)?;
643
644                if is_comparison_op && should_return_bool {
645                    field_expr = DfExpr::Cast(Cast {
646                        expr: Box::new(field_expr),
647                        data_type: ArrowDataType::Float64,
648                    });
649                }
650
651                Ok(LogicalPlan::Extension(Extension {
652                    node: Arc::new(
653                        EmptyMetric::new(
654                            self.ctx.start,
655                            self.ctx.end,
656                            self.ctx.interval,
657                            SPECIAL_TIME_FUNCTION.to_string(),
658                            DEFAULT_FIELD_COLUMN.to_string(),
659                            Some(field_expr),
660                        )
661                        .context(DataFusionPlanningSnafu)?,
662                    ),
663                }))
664            }
665            // lhs is a literal, rhs is a column
666            (Some(mut expr), None) => {
667                let input = self.prom_expr_to_plan(rhs, query_engine_state).await?;
668                // check if the literal is a special time expr
669                if let Some(time_expr) = self.try_build_special_time_expr_with_context(lhs) {
670                    expr = time_expr
671                }
672                let bin_expr_builder = |col: &String| {
673                    let binary_expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
674                    let mut binary_expr =
675                        binary_expr_builder(expr.clone(), DfExpr::Column(col.into()))?;
676
677                    if is_comparison_op && should_return_bool {
678                        binary_expr = DfExpr::Cast(Cast {
679                            expr: Box::new(binary_expr),
680                            data_type: ArrowDataType::Float64,
681                        });
682                    }
683                    Ok(binary_expr)
684                };
685                if is_comparison_op && !should_return_bool {
686                    self.filter_on_field_column(input, bin_expr_builder)
687                } else {
688                    self.projection_for_each_field_column(input, bin_expr_builder)
689                }
690            }
691            // lhs is a column, rhs is a literal
692            (None, Some(mut expr)) => {
693                let input = self.prom_expr_to_plan(lhs, query_engine_state).await?;
694                // check if the literal is a special time expr
695                if let Some(time_expr) = self.try_build_special_time_expr_with_context(rhs) {
696                    expr = time_expr
697                }
698                let bin_expr_builder = |col: &String| {
699                    let binary_expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
700                    let mut binary_expr =
701                        binary_expr_builder(DfExpr::Column(col.into()), expr.clone())?;
702
703                    if is_comparison_op && should_return_bool {
704                        binary_expr = DfExpr::Cast(Cast {
705                            expr: Box::new(binary_expr),
706                            data_type: ArrowDataType::Float64,
707                        });
708                    }
709                    Ok(binary_expr)
710                };
711                if is_comparison_op && !should_return_bool {
712                    self.filter_on_field_column(input, bin_expr_builder)
713                } else {
714                    self.projection_for_each_field_column(input, bin_expr_builder)
715                }
716            }
717            // both are columns. join them on time index
718            (None, None) => {
719                let left_input = self.prom_expr_to_plan(lhs, query_engine_state).await?;
720                let left_field_columns = self.ctx.field_columns.clone();
721                let left_time_index_column = self.ctx.time_index_column.clone();
722                let mut left_table_ref = self
723                    .table_ref()
724                    .unwrap_or_else(|_| TableReference::bare(""));
725                let left_context = self.ctx.clone();
726
727                let right_input = self.prom_expr_to_plan(rhs, query_engine_state).await?;
728                let right_field_columns = self.ctx.field_columns.clone();
729                let right_time_index_column = self.ctx.time_index_column.clone();
730                let mut right_table_ref = self
731                    .table_ref()
732                    .unwrap_or_else(|_| TableReference::bare(""));
733                let right_context = self.ctx.clone();
734
735                // TODO(ruihang): avoid join if left and right are the same table
736
737                // set op has "special" join semantics
738                if Self::is_token_a_set_op(*op) {
739                    return self.set_op_on_non_field_columns(
740                        left_input,
741                        right_input,
742                        left_context,
743                        right_context,
744                        *op,
745                        modifier,
746                    );
747                }
748
749                // normal join
750                if left_table_ref == right_table_ref {
751                    // rename table references to avoid ambiguity
752                    left_table_ref = TableReference::bare("lhs");
753                    right_table_ref = TableReference::bare("rhs");
754                    // `self.ctx` have ctx in right plan, if right plan have no tag,
755                    // we use left plan ctx as the ctx for subsequent calculations,
756                    // to avoid case like `host + scalar(...)`
757                    // we need preserve tag column on `host` table in subsequent projection,
758                    // which only show in left plan ctx.
759                    if self.ctx.tag_columns.is_empty() {
760                        self.ctx = left_context.clone();
761                        self.ctx.table_name = Some("lhs".to_string());
762                    } else {
763                        self.ctx.table_name = Some("rhs".to_string());
764                    }
765                }
766                let (output_field_columns, field_columns) =
767                    Self::align_binary_field_columns(&left_field_columns, &right_field_columns);
768                // PromQL binary arithmetic only combines the shared prefix of value columns.
769                // Keep the output field count aligned with that zipped prefix so planning
770                // remains stable even when the two sides have uneven multi-field schemas.
771                self.ctx.field_columns = output_field_columns;
772                let mut field_columns = field_columns.into_iter();
773
774                let join_plan = self.join_on_non_field_columns(
775                    left_input,
776                    right_input,
777                    left_table_ref.clone(),
778                    right_table_ref.clone(),
779                    left_time_index_column,
780                    right_time_index_column,
781                    // if left plan or right plan tag is empty, means case like `scalar(...) + host` or `host + scalar(...)`
782                    // under this case we only join on time index
783                    left_context.tag_columns.is_empty() || right_context.tag_columns.is_empty(),
784                    modifier,
785                )?;
786                let join_plan_schema = join_plan.schema().clone();
787
788                let bin_expr_builder = |_: &String| {
789                    let (left_col_name, right_col_name) = field_columns.next().unwrap();
790                    let left_col = join_plan_schema
791                        .qualified_field_with_name(Some(&left_table_ref), left_col_name)
792                        .context(DataFusionPlanningSnafu)?
793                        .into();
794                    let right_col = join_plan_schema
795                        .qualified_field_with_name(Some(&right_table_ref), right_col_name)
796                        .context(DataFusionPlanningSnafu)?
797                        .into();
798
799                    let binary_expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
800                    let mut binary_expr =
801                        binary_expr_builder(DfExpr::Column(left_col), DfExpr::Column(right_col))?;
802                    if is_comparison_op && should_return_bool {
803                        binary_expr = DfExpr::Cast(Cast {
804                            expr: Box::new(binary_expr),
805                            data_type: ArrowDataType::Float64,
806                        });
807                    }
808                    Ok(binary_expr)
809                };
810                if is_comparison_op && !should_return_bool {
811                    // PromQL comparison operators without `bool` are filters:
812                    //   - keep the instant-vector side sample values
813                    //   - drop samples where the comparison is false
814                    //
815                    // So we filter on the join result and then project only the side that should
816                    // be preserved according to PromQL semantics.
817                    let filtered = self.filter_on_field_column(join_plan, bin_expr_builder)?;
818                    let (project_table_ref, project_context) =
819                        match (lhs.value_type(), rhs.value_type()) {
820                            (ValueType::Scalar, ValueType::Vector) => {
821                                (&right_table_ref, &right_context)
822                            }
823                            _ => (&left_table_ref, &left_context),
824                        };
825                    self.project_binary_join_side(filtered, project_table_ref, project_context)
826                } else {
827                    self.projection_for_each_field_column(join_plan, bin_expr_builder)
828                }
829            }
830        }
831    }
832
833    fn project_binary_join_side(
834        &mut self,
835        input: LogicalPlan,
836        table_ref: &TableReference,
837        context: &PromPlannerContext,
838    ) -> Result<LogicalPlan> {
839        let schema = input.schema();
840
841        let mut project_exprs =
842            Vec::with_capacity(context.tag_columns.len() + context.field_columns.len() + 2);
843
844        // Project time index from the chosen side.
845        if let Some(time_index_column) = &context.time_index_column {
846            let time_index_col = schema
847                .qualified_field_with_name(Some(table_ref), time_index_column)
848                .context(DataFusionPlanningSnafu)?
849                .into();
850            project_exprs.push(DfExpr::Column(time_index_col));
851        }
852
853        // Project field columns from the chosen side.
854        for field_column in &context.field_columns {
855            let field_col = schema
856                .qualified_field_with_name(Some(table_ref), field_column)
857                .context(DataFusionPlanningSnafu)?
858                .into();
859            project_exprs.push(DfExpr::Column(field_col));
860        }
861
862        // Project tag columns from the chosen side.
863        for tag_column in &context.tag_columns {
864            let tag_col = schema
865                .qualified_field_with_name(Some(table_ref), tag_column)
866                .context(DataFusionPlanningSnafu)?
867                .into();
868            project_exprs.push(DfExpr::Column(tag_col));
869        }
870
871        // Preserve `__tsid` if present, so it can still be used internally downstream. It's
872        // stripped from the final output anyway.
873        if let Some(tsid_col) =
874            Self::optional_tsid_projection(schema, Some(table_ref), context.use_tsid)
875        {
876            project_exprs.push(tsid_col);
877        }
878
879        let plan = LogicalPlanBuilder::from(input)
880            .project(project_exprs)
881            .context(DataFusionPlanningSnafu)?
882            .build()
883            .context(DataFusionPlanningSnafu)?;
884
885        // Update context to reflect the projected schema. Don't keep a table qualifier since
886        // the result is a derived expression.
887        self.ctx = context.clone();
888        self.ctx.table_name = None;
889        self.ctx.schema_name = None;
890
891        Ok(plan)
892    }
893
894    fn prom_number_lit_to_plan(&mut self, number_literal: &NumberLiteral) -> Result<LogicalPlan> {
895        let NumberLiteral { val } = number_literal;
896        self.ctx.time_index_column = Some(DEFAULT_TIME_INDEX_COLUMN.to_string());
897        self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
898        self.ctx.reset_table_name_and_schema();
899        let literal_expr = df_prelude::lit(*val);
900
901        let plan = LogicalPlan::Extension(Extension {
902            node: Arc::new(
903                EmptyMetric::new(
904                    self.ctx.start,
905                    self.ctx.end,
906                    self.ctx.interval,
907                    SPECIAL_TIME_FUNCTION.to_string(),
908                    DEFAULT_FIELD_COLUMN.to_string(),
909                    Some(literal_expr),
910                )
911                .context(DataFusionPlanningSnafu)?,
912            ),
913        });
914        Ok(plan)
915    }
916
917    fn prom_string_lit_to_plan(&mut self, string_literal: &StringLiteral) -> Result<LogicalPlan> {
918        let StringLiteral { val } = string_literal;
919        self.ctx.time_index_column = Some(DEFAULT_TIME_INDEX_COLUMN.to_string());
920        self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
921        self.ctx.reset_table_name_and_schema();
922        let literal_expr = df_prelude::lit(val.clone());
923
924        let plan = LogicalPlan::Extension(Extension {
925            node: Arc::new(
926                EmptyMetric::new(
927                    self.ctx.start,
928                    self.ctx.end,
929                    self.ctx.interval,
930                    SPECIAL_TIME_FUNCTION.to_string(),
931                    DEFAULT_FIELD_COLUMN.to_string(),
932                    Some(literal_expr),
933                )
934                .context(DataFusionPlanningSnafu)?,
935            ),
936        });
937        Ok(plan)
938    }
939
940    async fn prom_vector_selector_to_plan(
941        &mut self,
942        vector_selector: &VectorSelector,
943        timestamp_fn: bool,
944    ) -> Result<LogicalPlan> {
945        let VectorSelector {
946            name,
947            offset,
948            matchers,
949            at: _,
950        } = vector_selector;
951        let matchers = self.preprocess_label_matchers(matchers, name)?;
952        if let Some(empty_plan) = self.setup_context().await? {
953            return Ok(empty_plan);
954        }
955        let normalize = self
956            .selector_to_series_normalize_plan(offset, matchers, false)
957            .await?;
958
959        let normalize = if timestamp_fn {
960            // If evaluating the PromQL `timestamp()` function, project the time index column as the value column
961            // before wrapping with [`InstantManipulate`], so the output matches PromQL's `timestamp()` semantics.
962            self.create_timestamp_func_plan(normalize)?
963        } else {
964            normalize
965        };
966
967        let manipulate = InstantManipulate::new(
968            self.ctx.start,
969            self.ctx.end,
970            self.ctx.lookback_delta,
971            self.ctx.interval,
972            self.ctx
973                .time_index_column
974                .clone()
975                .expect("time index should be set in `setup_context`"),
976            if self.ctx.use_tsid {
977                vec![DATA_SCHEMA_TSID_COLUMN_NAME.to_string()]
978            } else {
979                self.ctx.tag_columns.clone()
980            },
981            self.ctx.field_columns.first().cloned(),
982            normalize,
983        );
984        Ok(LogicalPlan::Extension(Extension {
985            node: Arc::new(manipulate),
986        }))
987    }
988
989    /// Builds a projection plan for the PromQL `timestamp()` function.
990    /// Projects the time index column as the value column for each row.
991    ///
992    /// # Arguments
993    /// * `normalize` - Input [`LogicalPlan`] for the normalized series.
994    ///
995    /// # Returns
996    /// Returns a [`Result<LogicalPlan>`] where the resulting logical plan projects the timestamp
997    /// column as the value column, along with the original tag and time index columns.
998    ///
999    /// # Timestamp vs. Time Function
1000    ///
1001    /// - **Timestamp Function (`timestamp()`)**: In PromQL, the `timestamp()` function returns the
1002    ///   timestamp (time index) of each sample as the value column.
1003    ///
1004    /// - **Time Function (`time()`)**: The `time()` function returns the evaluation time of the query
1005    ///   as a scalar value.
1006    ///
1007    /// # Side Effects
1008    /// Updates the planner context's field columns to the timestamp column name.
1009    ///
1010    fn create_timestamp_func_plan(&mut self, normalize: LogicalPlan) -> Result<LogicalPlan> {
1011        let time_expr = build_special_time_expr(self.ctx.time_index_column.as_ref().unwrap())
1012            .alias(DEFAULT_FIELD_COLUMN);
1013        self.ctx.field_columns = vec![time_expr.schema_name().to_string()];
1014        let mut project_exprs = Vec::with_capacity(self.ctx.tag_columns.len() + 2);
1015        project_exprs.push(self.create_time_index_column_expr()?);
1016        project_exprs.push(time_expr);
1017        project_exprs.extend(self.create_tag_column_exprs()?);
1018
1019        LogicalPlanBuilder::from(normalize)
1020            .project(project_exprs)
1021            .context(DataFusionPlanningSnafu)?
1022            .build()
1023            .context(DataFusionPlanningSnafu)
1024    }
1025
1026    async fn prom_matrix_selector_to_plan(
1027        &mut self,
1028        matrix_selector: &MatrixSelector,
1029    ) -> Result<LogicalPlan> {
1030        let MatrixSelector { vs, range } = matrix_selector;
1031        let VectorSelector {
1032            name,
1033            offset,
1034            matchers,
1035            ..
1036        } = vs;
1037        let matchers = self.preprocess_label_matchers(matchers, name)?;
1038        ensure!(!range.is_zero(), ZeroRangeSelectorSnafu);
1039        let range_ms = range.as_millis() as _;
1040        self.ctx.range = Some(range_ms);
1041
1042        // Some functions like rate may require special fields in the RangeManipulate plan
1043        // so we can't skip RangeManipulate.
1044        let normalize = match self.setup_context().await? {
1045            Some(empty_plan) => empty_plan,
1046            None => {
1047                self.selector_to_series_normalize_plan(offset, matchers, true)
1048                    .await?
1049            }
1050        };
1051        let manipulate = RangeManipulate::new(
1052            self.ctx.start,
1053            self.ctx.end,
1054            self.ctx.interval,
1055            // TODO(ruihang): convert via Timestamp datatypes to support different time units
1056            range_ms,
1057            self.ctx
1058                .time_index_column
1059                .clone()
1060                .expect("time index should be set in `setup_context`"),
1061            self.ctx.field_columns.clone(),
1062            normalize,
1063        )
1064        .context(DataFusionPlanningSnafu)?;
1065
1066        Ok(LogicalPlan::Extension(Extension {
1067            node: Arc::new(manipulate),
1068        }))
1069    }
1070
1071    async fn prom_call_expr_to_plan(
1072        &mut self,
1073        query_engine_state: &QueryEngineState,
1074        call_expr: &Call,
1075    ) -> Result<LogicalPlan> {
1076        let Call { func, args } = call_expr;
1077        // some special functions that are not expression but a plan
1078        match func.name {
1079            SPECIAL_HISTOGRAM_QUANTILE => {
1080                return self.create_histogram_plan(args, query_engine_state).await;
1081            }
1082            SPECIAL_VECTOR_FUNCTION => return self.create_vector_plan(args).await,
1083            SCALAR_FUNCTION => return self.create_scalar_plan(args, query_engine_state).await,
1084            SPECIAL_ABSENT_FUNCTION => {
1085                return self.create_absent_plan(args, query_engine_state).await;
1086            }
1087            _ => {}
1088        }
1089
1090        // transform function arguments
1091        let args = self.create_function_args(&args.args)?;
1092        let input = if let Some(prom_expr) = &args.input {
1093            self.prom_expr_to_plan_inner(prom_expr, func.name == "timestamp", query_engine_state)
1094                .await?
1095        } else {
1096            self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string());
1097            self.ctx.reset_table_name_and_schema();
1098            self.ctx.tag_columns = vec![];
1099            self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
1100            LogicalPlan::Extension(Extension {
1101                node: Arc::new(
1102                    EmptyMetric::new(
1103                        self.ctx.start,
1104                        self.ctx.end,
1105                        self.ctx.interval,
1106                        SPECIAL_TIME_FUNCTION.to_string(),
1107                        DEFAULT_FIELD_COLUMN.to_string(),
1108                        None,
1109                    )
1110                    .context(DataFusionPlanningSnafu)?,
1111                ),
1112            })
1113        };
1114        let (mut func_exprs, new_tags) =
1115            self.create_function_expr(func, args.literals.clone(), query_engine_state)?;
1116        func_exprs.insert(0, self.create_time_index_column_expr()?);
1117        func_exprs.extend_from_slice(&self.create_tag_column_exprs()?);
1118
1119        let builder = LogicalPlanBuilder::from(input)
1120            .project(func_exprs)
1121            .context(DataFusionPlanningSnafu)?
1122            .filter(self.create_empty_values_filter_expr()?)
1123            .context(DataFusionPlanningSnafu)?;
1124
1125        let builder = match func.name {
1126            "sort" => builder
1127                .sort(self.create_field_columns_sort_exprs(true))
1128                .context(DataFusionPlanningSnafu)?,
1129            "sort_desc" => builder
1130                .sort(self.create_field_columns_sort_exprs(false))
1131                .context(DataFusionPlanningSnafu)?,
1132            "sort_by_label" => builder
1133                .sort(Self::create_sort_exprs_by_tags(
1134                    func.name,
1135                    args.literals,
1136                    true,
1137                )?)
1138                .context(DataFusionPlanningSnafu)?,
1139            "sort_by_label_desc" => builder
1140                .sort(Self::create_sort_exprs_by_tags(
1141                    func.name,
1142                    args.literals,
1143                    false,
1144                )?)
1145                .context(DataFusionPlanningSnafu)?,
1146
1147            _ => builder,
1148        };
1149
1150        // Update context tags after building plan
1151        // We can't push them before planning, because they won't exist until projection.
1152        for tag in new_tags {
1153            self.ctx.tag_columns.push(tag);
1154        }
1155
1156        let plan = builder.build().context(DataFusionPlanningSnafu)?;
1157        common_telemetry::debug!("Created PromQL function plan: {plan:?} for {call_expr:?}");
1158
1159        Ok(plan)
1160    }
1161
1162    async fn prom_ext_expr_to_plan(
1163        &mut self,
1164        query_engine_state: &QueryEngineState,
1165        ext_expr: &promql_parser::parser::ast::Extension,
1166    ) -> Result<LogicalPlan> {
1167        // let promql_parser::parser::ast::Extension { expr } = ext_expr;
1168        let expr = &ext_expr.expr;
1169        let children = expr.children();
1170        let plan = self
1171            .prom_expr_to_plan(&children[0], query_engine_state)
1172            .await?;
1173        // Wrapper for the explanation/analyze of the existing plan
1174        // https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalPlanBuilder.html#method.explain
1175        // if `analyze` is true, runs the actual plan and produces
1176        // information about metrics during run.
1177        // if `verbose` is true, prints out additional details when VERBOSE keyword is specified
1178        match expr.name() {
1179            ANALYZE_NODE_NAME => LogicalPlanBuilder::from(plan)
1180                .explain(false, true)
1181                .unwrap()
1182                .build()
1183                .context(DataFusionPlanningSnafu),
1184            ANALYZE_VERBOSE_NODE_NAME => LogicalPlanBuilder::from(plan)
1185                .explain(true, true)
1186                .unwrap()
1187                .build()
1188                .context(DataFusionPlanningSnafu),
1189            EXPLAIN_NODE_NAME => LogicalPlanBuilder::from(plan)
1190                .explain(false, false)
1191                .unwrap()
1192                .build()
1193                .context(DataFusionPlanningSnafu),
1194            EXPLAIN_VERBOSE_NODE_NAME => LogicalPlanBuilder::from(plan)
1195                .explain(true, false)
1196                .unwrap()
1197                .build()
1198                .context(DataFusionPlanningSnafu),
1199            ALIAS_NODE_NAME => {
1200                let alias = expr
1201                    .as_any()
1202                    .downcast_ref::<AliasExpr>()
1203                    .context(UnexpectedPlanExprSnafu {
1204                        desc: "Expected AliasExpr",
1205                    })?
1206                    .alias
1207                    .clone();
1208                self.apply_alias(plan, alias)
1209            }
1210            _ => LogicalPlanBuilder::empty(true)
1211                .build()
1212                .context(DataFusionPlanningSnafu),
1213        }
1214    }
1215
1216    /// Extract metric name from `__name__` matcher and set it into [PromPlannerContext].
1217    /// Returns a new [Matchers] that doesn't contain metric name matcher.
1218    ///
1219    /// Each call to this function means new selector is started. Thus, the context will be reset
1220    /// at first.
1221    ///
1222    /// Name rule:
1223    /// - if `name` is some, then the matchers MUST NOT contain `__name__` matcher.
1224    /// - if `name` is none, then the matchers MAY contain NONE OR MULTIPLE `__name__` matchers.
1225    #[allow(clippy::mutable_key_type)]
1226    fn preprocess_label_matchers(
1227        &mut self,
1228        label_matchers: &Matchers,
1229        name: &Option<String>,
1230    ) -> Result<Matchers> {
1231        self.ctx.reset();
1232
1233        let metric_name;
1234        if let Some(name) = name.clone() {
1235            metric_name = Some(name);
1236            ensure!(
1237                label_matchers.find_matchers(METRIC_NAME).is_empty(),
1238                MultipleMetricMatchersSnafu
1239            );
1240        } else {
1241            let mut matches = label_matchers.find_matchers(METRIC_NAME);
1242            ensure!(!matches.is_empty(), NoMetricMatcherSnafu);
1243            ensure!(matches.len() == 1, MultipleMetricMatchersSnafu);
1244            ensure!(
1245                matches[0].op == MatchOp::Equal,
1246                UnsupportedMatcherOpSnafu {
1247                    matcher_op: matches[0].op.to_string(),
1248                    matcher: METRIC_NAME
1249                }
1250            );
1251            metric_name = matches.pop().map(|m| m.value);
1252        }
1253
1254        self.ctx.table_name = metric_name;
1255
1256        let mut matchers = HashSet::new();
1257        for matcher in &label_matchers.matchers {
1258            // TODO(ruihang): support other metric match ops
1259            if matcher.name == FIELD_COLUMN_MATCHER {
1260                self.ctx
1261                    .field_column_matcher
1262                    .get_or_insert_default()
1263                    .push(matcher.clone());
1264            } else if matcher.name == SCHEMA_COLUMN_MATCHER || matcher.name == DB_COLUMN_MATCHER {
1265                ensure!(
1266                    matcher.op == MatchOp::Equal,
1267                    UnsupportedMatcherOpSnafu {
1268                        matcher: matcher.name.clone(),
1269                        matcher_op: matcher.op.to_string(),
1270                    }
1271                );
1272                self.ctx.schema_name = Some(matcher.value.clone());
1273            } else if matcher.name != METRIC_NAME {
1274                self.ctx.selector_matcher.push(matcher.clone());
1275                let _ = matchers.insert(matcher.clone());
1276            }
1277        }
1278
1279        Ok(Matchers::new(matchers.into_iter().collect()))
1280    }
1281
1282    async fn selector_to_series_normalize_plan(
1283        &mut self,
1284        offset: &Option<Offset>,
1285        label_matchers: Matchers,
1286        is_range_selector: bool,
1287    ) -> Result<LogicalPlan> {
1288        // make table scan plan
1289        let table_ref = self.table_ref()?;
1290        let mut table_scan = self.create_table_scan_plan(table_ref.clone()).await?;
1291        let table_schema = table_scan.schema();
1292
1293        // make filter exprs
1294        let offset_duration = match offset {
1295            Some(Offset::Pos(duration)) => duration.as_millis() as Millisecond,
1296            Some(Offset::Neg(duration)) => -(duration.as_millis() as Millisecond),
1297            None => 0,
1298        };
1299        let mut scan_filters = Self::matchers_to_expr(label_matchers.clone(), table_schema)?;
1300        if let Some(time_index_filter) = self.build_time_index_filter(offset_duration)? {
1301            scan_filters.push(time_index_filter);
1302        }
1303        table_scan = LogicalPlanBuilder::from(table_scan)
1304            .filter(conjunction(scan_filters).unwrap()) // Safety: `scan_filters` is not empty.
1305            .context(DataFusionPlanningSnafu)?
1306            .build()
1307            .context(DataFusionPlanningSnafu)?;
1308
1309        // make a projection plan if there is any `__field__` matcher
1310        if let Some(field_matchers) = &self.ctx.field_column_matcher {
1311            let col_set = self.ctx.field_columns.iter().collect::<HashSet<_>>();
1312            // opt-in set
1313            let mut result_set = HashSet::new();
1314            // opt-out set
1315            let mut reverse_set = HashSet::new();
1316            for matcher in field_matchers {
1317                match &matcher.op {
1318                    MatchOp::Equal => {
1319                        if col_set.contains(&matcher.value) {
1320                            let _ = result_set.insert(matcher.value.clone());
1321                        } else {
1322                            return Err(ColumnNotFoundSnafu {
1323                                col: matcher.value.clone(),
1324                            }
1325                            .build());
1326                        }
1327                    }
1328                    MatchOp::NotEqual => {
1329                        if col_set.contains(&matcher.value) {
1330                            let _ = reverse_set.insert(matcher.value.clone());
1331                        } else {
1332                            return Err(ColumnNotFoundSnafu {
1333                                col: matcher.value.clone(),
1334                            }
1335                            .build());
1336                        }
1337                    }
1338                    MatchOp::Re(regex) => {
1339                        for col in &self.ctx.field_columns {
1340                            if regex.is_match(col) {
1341                                let _ = result_set.insert(col.clone());
1342                            }
1343                        }
1344                    }
1345                    MatchOp::NotRe(regex) => {
1346                        for col in &self.ctx.field_columns {
1347                            if regex.is_match(col) {
1348                                let _ = reverse_set.insert(col.clone());
1349                            }
1350                        }
1351                    }
1352                }
1353            }
1354            // merge two set
1355            if result_set.is_empty() {
1356                result_set = col_set.into_iter().cloned().collect();
1357            }
1358            for col in reverse_set {
1359                let _ = result_set.remove(&col);
1360            }
1361
1362            // mask the field columns in context using computed result set
1363            self.ctx.field_columns = self
1364                .ctx
1365                .field_columns
1366                .drain(..)
1367                .filter(|col| result_set.contains(col))
1368                .collect();
1369
1370            let exprs = result_set
1371                .into_iter()
1372                .map(|col| DfExpr::Column(Column::new_unqualified(col)))
1373                .chain(self.create_tag_column_exprs()?)
1374                .chain(
1375                    self.ctx
1376                        .use_tsid
1377                        .then_some(DfExpr::Column(Column::new_unqualified(
1378                            DATA_SCHEMA_TSID_COLUMN_NAME,
1379                        ))),
1380                )
1381                .chain(Some(self.create_time_index_column_expr()?))
1382                .collect::<Vec<_>>();
1383
1384            // reuse this variable for simplicity
1385            table_scan = LogicalPlanBuilder::from(table_scan)
1386                .project(exprs)
1387                .context(DataFusionPlanningSnafu)?
1388                .build()
1389                .context(DataFusionPlanningSnafu)?;
1390        }
1391
1392        // make sort plan
1393        let series_key_columns = if self.ctx.use_tsid {
1394            vec![DATA_SCHEMA_TSID_COLUMN_NAME.to_string()]
1395        } else {
1396            self.ctx.tag_columns.clone()
1397        };
1398
1399        let sort_exprs = if self.ctx.use_tsid {
1400            vec![
1401                DfExpr::Column(Column::from_name(DATA_SCHEMA_TSID_COLUMN_NAME)).sort(true, true),
1402                self.create_time_index_column_expr()?.sort(true, true),
1403            ]
1404        } else {
1405            self.create_tag_and_time_index_column_sort_exprs()?
1406        };
1407
1408        let sort_plan = LogicalPlanBuilder::from(table_scan)
1409            .sort(sort_exprs)
1410            .context(DataFusionPlanningSnafu)?
1411            .build()
1412            .context(DataFusionPlanningSnafu)?;
1413
1414        // make divide plan
1415        let time_index_column =
1416            self.ctx
1417                .time_index_column
1418                .clone()
1419                .with_context(|| TimeIndexNotFoundSnafu {
1420                    table: table_ref.to_string(),
1421                })?;
1422        let divide_plan = LogicalPlan::Extension(Extension {
1423            node: Arc::new(SeriesDivide::new(
1424                series_key_columns.clone(),
1425                time_index_column,
1426                sort_plan,
1427            )),
1428        });
1429
1430        // make series_normalize plan
1431        if !is_range_selector && offset_duration == 0 {
1432            return Ok(divide_plan);
1433        }
1434        let series_normalize = SeriesNormalize::new(
1435            offset_duration,
1436            self.ctx
1437                .time_index_column
1438                .clone()
1439                .with_context(|| TimeIndexNotFoundSnafu {
1440                    table: table_ref.to_quoted_string(),
1441                })?,
1442            is_range_selector,
1443            series_key_columns,
1444            divide_plan,
1445        );
1446        let logical_plan = LogicalPlan::Extension(Extension {
1447            node: Arc::new(series_normalize),
1448        });
1449
1450        Ok(logical_plan)
1451    }
1452
1453    /// Convert [LabelModifier] to [Column] exprs for aggregation.
1454    /// Timestamp column and tag columns will be included.
1455    ///
1456    /// # Side effect
1457    ///
1458    /// This method will also change the tag columns in ctx if `update_ctx` is true.
1459    fn agg_modifier_to_col(
1460        &mut self,
1461        input_schema: &DFSchemaRef,
1462        modifier: &Option<LabelModifier>,
1463        update_ctx: bool,
1464    ) -> Result<Vec<DfExpr>> {
1465        match modifier {
1466            None => {
1467                if update_ctx {
1468                    self.ctx.tag_columns.clear();
1469                }
1470                Ok(vec![self.create_time_index_column_expr()?])
1471            }
1472            Some(LabelModifier::Include(labels)) => {
1473                if update_ctx {
1474                    self.ctx.tag_columns.clear();
1475                }
1476                let mut exprs = Vec::with_capacity(labels.labels.len());
1477                for label in &labels.labels {
1478                    if is_metric_engine_internal_column(label) {
1479                        continue;
1480                    }
1481                    // nonexistence label will be ignored
1482                    if let Some(column_name) = Self::find_case_sensitive_column(input_schema, label)
1483                    {
1484                        exprs.push(DfExpr::Column(Column::from_name(column_name.clone())));
1485
1486                        if update_ctx {
1487                            // update the tag columns in context
1488                            self.ctx.tag_columns.push(column_name);
1489                        }
1490                    }
1491                }
1492                // add timestamp column
1493                exprs.push(self.create_time_index_column_expr()?);
1494
1495                Ok(exprs)
1496            }
1497            Some(LabelModifier::Exclude(labels)) => {
1498                let mut all_fields = input_schema
1499                    .fields()
1500                    .iter()
1501                    .map(|f| f.name())
1502                    .collect::<BTreeSet<_>>();
1503
1504                // Exclude metric engine internal columns (not PromQL labels) from the implicit
1505                // "without" label set.
1506                all_fields.retain(|col| !is_metric_engine_internal_column(col.as_str()));
1507
1508                // remove "without"-ed fields
1509                // nonexistence label will be ignored
1510                for label in &labels.labels {
1511                    let _ = all_fields.remove(label);
1512                }
1513
1514                // remove time index and value fields
1515                if let Some(time_index) = &self.ctx.time_index_column {
1516                    let _ = all_fields.remove(time_index);
1517                }
1518                for value in &self.ctx.field_columns {
1519                    let _ = all_fields.remove(value);
1520                }
1521
1522                if update_ctx {
1523                    // change the tag columns in context
1524                    self.ctx.tag_columns = all_fields.iter().map(|col| (*col).clone()).collect();
1525                }
1526
1527                // collect remaining fields and convert to col expr
1528                let mut exprs = all_fields
1529                    .into_iter()
1530                    .map(|c| DfExpr::Column(Column::from(c)))
1531                    .collect::<Vec<_>>();
1532
1533                // add timestamp column
1534                exprs.push(self.create_time_index_column_expr()?);
1535
1536                Ok(exprs)
1537            }
1538        }
1539    }
1540
1541    // TODO(ruihang): ignore `MetricNameLabel` (`__name__`) matcher
1542    pub fn matchers_to_expr(
1543        label_matchers: Matchers,
1544        table_schema: &DFSchemaRef,
1545    ) -> Result<Vec<DfExpr>> {
1546        let mut exprs = Vec::with_capacity(label_matchers.matchers.len());
1547        for matcher in label_matchers.matchers {
1548            if matcher.name == SCHEMA_COLUMN_MATCHER
1549                || matcher.name == DB_COLUMN_MATCHER
1550                || matcher.name == FIELD_COLUMN_MATCHER
1551            {
1552                continue;
1553            }
1554
1555            let column_name = Self::find_case_sensitive_column(table_schema, matcher.name.as_str());
1556            let col = if let Some(column_name) = column_name {
1557                DfExpr::Column(Column::from_name(column_name))
1558            } else {
1559                DfExpr::Literal(ScalarValue::Utf8(Some(String::new())), None)
1560                    .alias(matcher.name.clone())
1561            };
1562            let lit = DfExpr::Literal(ScalarValue::Utf8(Some(matcher.value)), None);
1563            let expr = match matcher.op {
1564                MatchOp::Equal => col.eq(lit),
1565                MatchOp::NotEqual => col.not_eq(lit),
1566                MatchOp::Re(re) => {
1567                    // TODO(ruihang): a more programmatic way to handle this in datafusion
1568
1569                    // This is a hack to handle `.+` and `.*`, and is not strictly correct
1570                    // `.` doesn't match newline (`\n`). Given this is in PromQL context,
1571                    // most of the time it's fine.
1572                    if re.as_str() == "^(?:.*)$" {
1573                        continue;
1574                    }
1575                    if re.as_str() == "^(?:.+)$" {
1576                        col.not_eq(DfExpr::Literal(
1577                            ScalarValue::Utf8(Some(String::new())),
1578                            None,
1579                        ))
1580                    } else {
1581                        DfExpr::BinaryExpr(BinaryExpr {
1582                            left: Box::new(col),
1583                            op: Operator::RegexMatch,
1584                            right: Box::new(DfExpr::Literal(
1585                                ScalarValue::Utf8(Some(re.as_str().to_string())),
1586                                None,
1587                            )),
1588                        })
1589                    }
1590                }
1591                MatchOp::NotRe(re) => {
1592                    if re.as_str() == "^(?:.*)$" {
1593                        DfExpr::Literal(ScalarValue::Boolean(Some(false)), None)
1594                    } else if re.as_str() == "^(?:.+)$" {
1595                        col.eq(DfExpr::Literal(
1596                            ScalarValue::Utf8(Some(String::new())),
1597                            None,
1598                        ))
1599                    } else {
1600                        DfExpr::BinaryExpr(BinaryExpr {
1601                            left: Box::new(col),
1602                            op: Operator::RegexNotMatch,
1603                            right: Box::new(DfExpr::Literal(
1604                                ScalarValue::Utf8(Some(re.as_str().to_string())),
1605                                None,
1606                            )),
1607                        })
1608                    }
1609                }
1610            };
1611            exprs.push(expr);
1612        }
1613
1614        Ok(exprs)
1615    }
1616
1617    fn find_case_sensitive_column(schema: &DFSchemaRef, column: &str) -> Option<String> {
1618        if is_metric_engine_internal_column(column) {
1619            return None;
1620        }
1621        schema
1622            .fields()
1623            .iter()
1624            .find(|field| field.name() == column)
1625            .map(|field| field.name().clone())
1626    }
1627
1628    fn table_from_source(&self, source: &Arc<dyn TableSource>) -> Result<table::TableRef> {
1629        Ok(source
1630            .as_any()
1631            .downcast_ref::<DefaultTableSource>()
1632            .context(UnknownTableSnafu)?
1633            .table_provider
1634            .as_any()
1635            .downcast_ref::<DfTableProviderAdapter>()
1636            .context(UnknownTableSnafu)?
1637            .table())
1638    }
1639
1640    fn table_ref(&self) -> Result<TableReference> {
1641        let table_name = self
1642            .ctx
1643            .table_name
1644            .clone()
1645            .context(TableNameNotFoundSnafu)?;
1646
1647        // set schema name if `__schema__` is given
1648        let table_ref = if let Some(schema_name) = &self.ctx.schema_name {
1649            TableReference::partial(schema_name.as_str(), table_name.as_str())
1650        } else {
1651            TableReference::bare(table_name.as_str())
1652        };
1653
1654        Ok(table_ref)
1655    }
1656
1657    fn build_time_index_filter(&self, offset_duration: i64) -> Result<Option<DfExpr>> {
1658        let start = self.ctx.start;
1659        let end = self.ctx.end;
1660        if end < start {
1661            return InvalidTimeRangeSnafu { start, end }.fail();
1662        }
1663        let lookback_delta = self.ctx.lookback_delta;
1664        let range = self.ctx.range.unwrap_or_default();
1665        let interval = self.ctx.interval;
1666        let time_index_expr = self.create_time_index_column_expr()?;
1667        let num_points = (end - start) / interval;
1668
1669        // Prometheus semantics:
1670        // - Instant selector lookback: (eval_ts - lookback_delta, eval_ts]
1671        // - Range selector:           (eval_ts - range, eval_ts]
1672        //
1673        // So samples positioned exactly at the lower boundary must be excluded. We align the scan
1674        // lower bound with Prometheus by shifting it forward by 1ms (millisecond granularity),
1675        // while still using a `>=` filter.
1676        let selector_window = if range == 0 { lookback_delta } else { range };
1677        let lower_exclusive_adjustment = if selector_window > 0 { 1 } else { 0 };
1678
1679        // Scan a continuous time range
1680        if (end - start) / interval > MAX_SCATTER_POINTS || interval <= INTERVAL_1H {
1681            let single_time_range = time_index_expr
1682                .clone()
1683                .gt_eq(DfExpr::Literal(
1684                    ScalarValue::TimestampMillisecond(
1685                        Some(
1686                            self.ctx.start - offset_duration - selector_window
1687                                + lower_exclusive_adjustment,
1688                        ),
1689                        None,
1690                    ),
1691                    None,
1692                ))
1693                .and(time_index_expr.lt_eq(DfExpr::Literal(
1694                    ScalarValue::TimestampMillisecond(Some(self.ctx.end - offset_duration), None),
1695                    None,
1696                )));
1697            return Ok(Some(single_time_range));
1698        }
1699
1700        // Otherwise scan scatter ranges separately
1701        let mut filters = Vec::with_capacity(num_points as usize + 1);
1702        for timestamp in (start..=end).step_by(interval as usize) {
1703            filters.push(
1704                time_index_expr
1705                    .clone()
1706                    .gt_eq(DfExpr::Literal(
1707                        ScalarValue::TimestampMillisecond(
1708                            Some(
1709                                timestamp - offset_duration - selector_window
1710                                    + lower_exclusive_adjustment,
1711                            ),
1712                            None,
1713                        ),
1714                        None,
1715                    ))
1716                    .and(time_index_expr.clone().lt_eq(DfExpr::Literal(
1717                        ScalarValue::TimestampMillisecond(Some(timestamp - offset_duration), None),
1718                        None,
1719                    ))),
1720            )
1721        }
1722
1723        Ok(filters.into_iter().reduce(DfExpr::or))
1724    }
1725
    /// Create the table scan plan for `table_ref`.
    ///
    /// When the resolved table is a metric engine logical table, its physical table resolves,
    /// and the physical schema has both the table id column and a `UInt64` tsid column, the
    /// physical table is scanned instead. That scan is filtered by the logical table's id and
    /// aliased back to `table_ref`. Otherwise the logical table is scanned directly.
    ///
    /// A projection is added on top of the scan in two cases:
    /// - the time index is not in millisecond precision: it is cast to millisecond;
    /// - the physical table was scanned: the internal table id column is dropped.
    ///
    /// # Side effect
    ///
    /// Sets `use_tsid` in ctx according to whether the physical table scan is used.
    ///
    /// # Errors
    ///
    /// Returns an error when the table cannot be resolved, the table source is not of the
    /// expected type, the time index column is missing, or DataFusion fails to build the plan.
    async fn create_table_scan_plan(&mut self, table_ref: TableReference) -> Result<LogicalPlan> {
        let provider = self
            .table_provider
            .resolve_table(table_ref.clone())
            .await
            .context(CatalogSnafu)?;

        let logical_table = self.table_from_source(&provider)?;

        // Try to rewrite the table scan to physical table scan if possible.
        // These three start out describing a plain scan of the logical table and are only
        // overwritten together once the physical table is known to be usable.
        let mut maybe_phy_table_ref = table_ref.clone();
        let mut scan_provider = provider;
        let mut table_id_filter: Option<u32> = None;

        // If it's a metric engine logical table, scan its physical table directly and filter by
        // `__table_id = logical_table_id` to get access to internal columns like `__tsid`.
        if logical_table.table_info().meta.engine == METRIC_ENGINE_NAME
            && let Some(physical_table_name) = logical_table
                .table_info()
                .meta
                .options
                .extra_options
                .get(LOGICAL_TABLE_METADATA_KEY)
        {
            // The physical table is looked up in the same schema as the query, if one is set.
            let physical_table_ref = if let Some(schema_name) = &self.ctx.schema_name {
                TableReference::partial(schema_name.as_str(), physical_table_name.as_str())
            } else {
                TableReference::bare(physical_table_name.as_str())
            };

            let physical_provider = match self
                .table_provider
                .resolve_table(physical_table_ref.clone())
                .await
            {
                Ok(provider) => provider,
                Err(e) if e.status_code() == StatusCode::TableNotFound => {
                    // Fall back to scanning the logical table. It still works, but without
                    // `__tsid` optimization.
                    scan_provider.clone()
                }
                Err(e) => return Err(e).context(CatalogSnafu),
            };

            // Pointer equality means the fallback above was taken (or the same source was
            // returned), in which case there is nothing to rewrite.
            if !Arc::ptr_eq(&physical_provider, &scan_provider) {
                // Only rewrite when internal columns exist in physical schema.
                let physical_table = self.table_from_source(&physical_provider)?;

                let has_table_id = physical_table
                    .schema()
                    .column_schema_by_name(DATA_SCHEMA_TABLE_ID_COLUMN_NAME)
                    .is_some();
                let has_tsid = physical_table
                    .schema()
                    .column_schema_by_name(DATA_SCHEMA_TSID_COLUMN_NAME)
                    .is_some_and(|col| matches!(col.data_type, ConcreteDataType::UInt64(_)));

                if has_table_id && has_tsid {
                    scan_provider = physical_provider;
                    maybe_phy_table_ref = physical_table_ref;
                    table_id_filter = Some(logical_table.table_info().ident.table_id);
                }
            }
        }

        let scan_table = self.table_from_source(&scan_provider)?;

        // `__tsid` is only used when the physical table is scanned and exposes it as UInt64.
        let use_tsid = table_id_filter.is_some()
            && scan_table
                .schema()
                .column_schema_by_name(DATA_SCHEMA_TSID_COLUMN_NAME)
                .is_some_and(|col| matches!(col.data_type, ConcreteDataType::UInt64(_)));
        self.ctx.use_tsid = use_tsid;

        let all_table_tags = self.ctx.tag_columns.clone();

        // Tag columns to read from the scan. With `__tsid`, the tags referenced by the
        // selector matchers are added to the context's tag columns, then sorted and deduplicated.
        let scan_tag_columns = if use_tsid {
            let mut scan_tags = self.ctx.tag_columns.clone();
            for matcher in &self.ctx.selector_matcher {
                if is_metric_engine_internal_column(&matcher.name) {
                    continue;
                }
                if all_table_tags.iter().any(|tag| tag == &matcher.name) {
                    scan_tags.push(matcher.name.clone());
                }
            }
            scan_tags.sort_unstable();
            scan_tags.dedup();
            scan_tags
        } else {
            self.ctx.tag_columns.clone()
        };

        let is_time_index_ms = scan_table
            .schema()
            .timestamp_column()
            .with_context(|| TimeIndexNotFoundSnafu {
                table: maybe_phy_table_ref.to_quoted_string(),
            })?
            .data_type
            == ConcreteDataType::timestamp_millisecond_datatype();

        // When scanning the physical table, project only the columns that are needed: the
        // table id (for the filter below), the time index, tags, fields and optionally `__tsid`.
        // The logical table scan keeps all of its columns (`None`).
        let scan_projection = if table_id_filter.is_some() {
            let mut required_columns = HashSet::new();
            required_columns.insert(DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string());
            required_columns.insert(self.ctx.time_index_column.clone().with_context(|| {
                TimeIndexNotFoundSnafu {
                    table: maybe_phy_table_ref.to_quoted_string(),
                }
            })?);
            for col in &scan_tag_columns {
                required_columns.insert(col.clone());
            }
            for col in &self.ctx.field_columns {
                required_columns.insert(col.clone());
            }
            if use_tsid {
                required_columns.insert(DATA_SCHEMA_TSID_COLUMN_NAME.to_string());
            }

            // Translate the required column names into indices of the scanned table's schema.
            let arrow_schema = scan_table.schema().arrow_schema().clone();
            Some(
                arrow_schema
                    .fields()
                    .iter()
                    .enumerate()
                    .filter(|(_, field)| required_columns.contains(field.name().as_str()))
                    .map(|(idx, _)| idx)
                    .collect::<Vec<_>>(),
            )
        } else {
            None
        };

        let mut scan_plan =
            LogicalPlanBuilder::scan(maybe_phy_table_ref.clone(), scan_provider, scan_projection)
                .context(DataFusionPlanningSnafu)?
                .build()
                .context(DataFusionPlanningSnafu)?;

        // Keep only the rows that belong to the logical table.
        if let Some(table_id) = table_id_filter {
            scan_plan = LogicalPlanBuilder::from(scan_plan)
                .filter(
                    DfExpr::Column(Column::from_name(DATA_SCHEMA_TABLE_ID_COLUMN_NAME))
                        .eq(lit(table_id)),
                )
                .context(DataFusionPlanningSnafu)?
                .alias(table_ref.clone()) // rename the relation back to logical table's name after filtering
                .context(DataFusionPlanningSnafu)?
                .build()
                .context(DataFusionPlanningSnafu)?;
        }

        if !is_time_index_ms {
            // cast to ms if time_index not in Millisecond precision
            // The projection lists fields, tags, the optional `__tsid` and finally the casted
            // time index, aliased to its original name under `table_ref`. The table id column
            // is not listed, so it is dropped here as well.
            let expr: Vec<_> = self
                .create_field_column_exprs()?
                .into_iter()
                .chain(
                    scan_tag_columns
                        .iter()
                        .map(|tag| DfExpr::Column(Column::from_name(tag))),
                )
                .chain(self.ctx.use_tsid.then_some(DfExpr::Column(Column::new(
                    Some(table_ref.clone()),
                    DATA_SCHEMA_TSID_COLUMN_NAME.to_string(),
                ))))
                .chain(Some(DfExpr::Alias(Alias {
                    expr: Box::new(DfExpr::Cast(Cast {
                        expr: Box::new(self.create_time_index_column_expr()?),
                        data_type: ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None),
                    })),
                    relation: Some(table_ref.clone()),
                    name: self
                        .ctx
                        .time_index_column
                        .as_ref()
                        .with_context(|| TimeIndexNotFoundSnafu {
                            table: table_ref.to_quoted_string(),
                        })?
                        .clone(),
                    metadata: None,
                })))
                .collect::<Vec<_>>();
            scan_plan = LogicalPlanBuilder::from(scan_plan)
                .project(expr)
                .context(DataFusionPlanningSnafu)?
                .build()
                .context(DataFusionPlanningSnafu)?;
        } else if table_id_filter.is_some() {
            // Drop the internal `__table_id` column after filtering.
            let project_exprs = self
                .create_field_column_exprs()?
                .into_iter()
                .chain(
                    scan_tag_columns
                        .iter()
                        .map(|tag| DfExpr::Column(Column::from_name(tag))),
                )
                .chain(
                    self.ctx
                        .use_tsid
                        .then_some(DfExpr::Column(Column::from_name(
                            DATA_SCHEMA_TSID_COLUMN_NAME,
                        ))),
                )
                .chain(Some(self.create_time_index_column_expr()?))
                .collect::<Vec<_>>();

            scan_plan = LogicalPlanBuilder::from(scan_plan)
                .project(project_exprs)
                .context(DataFusionPlanningSnafu)?
                .build()
                .context(DataFusionPlanningSnafu)?;
        }

        let result = LogicalPlanBuilder::from(scan_plan)
            .build()
            .context(DataFusionPlanningSnafu)?;
        Ok(result)
    }
1951
1952    fn collect_row_key_tag_columns_from_plan(
1953        &self,
1954        plan: &LogicalPlan,
1955    ) -> Result<BTreeSet<String>> {
1956        fn walk(
1957            planner: &PromPlanner,
1958            plan: &LogicalPlan,
1959            out: &mut BTreeSet<String>,
1960        ) -> Result<()> {
1961            if let LogicalPlan::TableScan(scan) = plan {
1962                let table = planner.table_from_source(&scan.source)?;
1963                for col in table.table_info().meta.row_key_column_names() {
1964                    if col != DATA_SCHEMA_TABLE_ID_COLUMN_NAME
1965                        && col != DATA_SCHEMA_TSID_COLUMN_NAME
1966                        && !is_metric_engine_internal_column(col)
1967                    {
1968                        out.insert(col.clone());
1969                    }
1970                }
1971            }
1972
1973            for input in plan.inputs() {
1974                walk(planner, input, out)?;
1975            }
1976            Ok(())
1977        }
1978
1979        let mut out = BTreeSet::new();
1980        walk(self, plan, &mut out)?;
1981        Ok(out)
1982    }
1983
1984    fn ensure_tag_columns_available(
1985        &self,
1986        plan: LogicalPlan,
1987        required_tags: &BTreeSet<String>,
1988    ) -> Result<LogicalPlan> {
1989        if required_tags.is_empty() {
1990            return Ok(plan);
1991        }
1992
1993        struct Rewriter {
1994            required_tags: BTreeSet<String>,
1995        }
1996
1997        impl TreeNodeRewriter for Rewriter {
1998            type Node = LogicalPlan;
1999
2000            fn f_up(
2001                &mut self,
2002                node: Self::Node,
2003            ) -> datafusion_common::Result<Transformed<Self::Node>> {
2004                match node {
2005                    LogicalPlan::TableScan(scan) => {
2006                        let schema = scan.source.schema();
2007                        let mut projection = match scan.projection.clone() {
2008                            Some(p) => p,
2009                            None => {
2010                                // Scanning all columns already covers required tags.
2011                                return Ok(Transformed::no(LogicalPlan::TableScan(scan)));
2012                            }
2013                        };
2014
2015                        let mut changed = false;
2016                        for tag in &self.required_tags {
2017                            if let Some((idx, _)) = schema
2018                                .fields()
2019                                .iter()
2020                                .enumerate()
2021                                .find(|(_, field)| field.name() == tag)
2022                                && !projection.contains(&idx)
2023                            {
2024                                projection.push(idx);
2025                                changed = true;
2026                            }
2027                        }
2028
2029                        if !changed {
2030                            return Ok(Transformed::no(LogicalPlan::TableScan(scan)));
2031                        }
2032
2033                        projection.sort_unstable();
2034                        projection.dedup();
2035
2036                        let new_scan = TableScan::try_new(
2037                            scan.table_name.clone(),
2038                            scan.source.clone(),
2039                            Some(projection),
2040                            scan.filters,
2041                            scan.fetch,
2042                        )?;
2043                        Ok(Transformed::yes(LogicalPlan::TableScan(new_scan)))
2044                    }
2045                    LogicalPlan::Projection(proj) => {
2046                        let input_schema = proj.input.schema();
2047
2048                        let existing = proj
2049                            .schema
2050                            .fields()
2051                            .iter()
2052                            .map(|f| f.name().as_str())
2053                            .collect::<HashSet<_>>();
2054
2055                        let mut expr = proj.expr.clone();
2056                        let mut has_changed = false;
2057                        for tag in &self.required_tags {
2058                            if existing.contains(tag.as_str()) {
2059                                continue;
2060                            }
2061
2062                            if let Some(idx) = input_schema.index_of_column_by_name(None, tag) {
2063                                expr.push(DfExpr::Column(Column::from(
2064                                    input_schema.qualified_field(idx),
2065                                )));
2066                                has_changed = true;
2067                            }
2068                        }
2069
2070                        if !has_changed {
2071                            return Ok(Transformed::no(LogicalPlan::Projection(proj)));
2072                        }
2073
2074                        let new_proj = Projection::try_new(expr, proj.input)?;
2075                        Ok(Transformed::yes(LogicalPlan::Projection(new_proj)))
2076                    }
2077                    other => Ok(Transformed::no(other)),
2078                }
2079            }
2080        }
2081
2082        let mut rewriter = Rewriter {
2083            required_tags: required_tags.clone(),
2084        };
2085        let rewritten = plan
2086            .rewrite(&mut rewriter)
2087            .context(DataFusionPlanningSnafu)?;
2088        Ok(rewritten.data)
2089    }
2090
2091    fn refresh_tag_columns_from_schema(&mut self, schema: &DFSchemaRef) {
2092        let time_index = self.ctx.time_index_column.as_deref();
2093        let field_columns = self.ctx.field_columns.iter().collect::<HashSet<_>>();
2094
2095        let mut tags = schema
2096            .fields()
2097            .iter()
2098            .map(|f| f.name())
2099            .filter(|name| Some(name.as_str()) != time_index)
2100            .filter(|name| !field_columns.contains(name))
2101            .filter(|name| !is_metric_engine_internal_column(name))
2102            .cloned()
2103            .collect::<Vec<_>>();
2104        tags.sort_unstable();
2105        tags.dedup();
2106        self.ctx.tag_columns = tags;
2107    }
2108
2109    /// Setup [PromPlannerContext]'s state fields.
2110    ///
2111    /// Returns a logical plan for an empty metric.
2112    async fn setup_context(&mut self) -> Result<Option<LogicalPlan>> {
2113        let table_ref = self.table_ref()?;
2114        let source = match self.table_provider.resolve_table(table_ref.clone()).await {
2115            Err(e) if e.status_code() == StatusCode::TableNotFound => {
2116                let plan = self.setup_context_for_empty_metric()?;
2117                return Ok(Some(plan));
2118            }
2119            res => res.context(CatalogSnafu)?,
2120        };
2121        let table = self.table_from_source(&source)?;
2122
2123        // set time index column name
2124        let time_index = table
2125            .schema()
2126            .timestamp_column()
2127            .with_context(|| TimeIndexNotFoundSnafu {
2128                table: table_ref.to_quoted_string(),
2129            })?
2130            .name
2131            .clone();
2132        self.ctx.time_index_column = Some(time_index);
2133
2134        // set values columns
2135        let values = table
2136            .table_info()
2137            .meta
2138            .field_column_names()
2139            .cloned()
2140            .collect();
2141        self.ctx.field_columns = values;
2142
2143        // set primary key (tag) columns
2144        let tags = table
2145            .table_info()
2146            .meta
2147            .row_key_column_names()
2148            .filter(|col| {
2149                // remove metric engine's internal columns
2150                col != &DATA_SCHEMA_TABLE_ID_COLUMN_NAME && col != &DATA_SCHEMA_TSID_COLUMN_NAME
2151            })
2152            .cloned()
2153            .collect();
2154        self.ctx.tag_columns = tags;
2155
2156        self.ctx.use_tsid = false;
2157
2158        Ok(None)
2159    }
2160
2161    /// Setup [PromPlannerContext]'s state fields for a non existent table
2162    /// without any rows.
2163    fn setup_context_for_empty_metric(&mut self) -> Result<LogicalPlan> {
2164        self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string());
2165        self.ctx.reset_table_name_and_schema();
2166        self.ctx.tag_columns = vec![];
2167        self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
2168        self.ctx.use_tsid = false;
2169
2170        // The table doesn't have any data, so we set start to 0 and end to -1.
2171        let plan = LogicalPlan::Extension(Extension {
2172            node: Arc::new(
2173                EmptyMetric::new(
2174                    0,
2175                    -1,
2176                    self.ctx.interval,
2177                    SPECIAL_TIME_FUNCTION.to_string(),
2178                    DEFAULT_FIELD_COLUMN.to_string(),
2179                    Some(lit(0.0f64)),
2180                )
2181                .context(DataFusionPlanningSnafu)?,
2182            ),
2183        });
2184        Ok(plan)
2185    }
2186
2187    // TODO(ruihang): insert column expr
2188    fn create_function_args(&self, args: &[Box<PromExpr>]) -> Result<FunctionArgs> {
2189        let mut result = FunctionArgs::default();
2190
2191        for arg in args {
2192            // First try to parse as literal expression (including binary expressions like 100.0 + 3.0)
2193            if let Some(expr) = Self::try_build_literal_expr(arg) {
2194                result.literals.push(expr);
2195            } else {
2196                // If not a literal, treat as vector input
2197                match arg.as_ref() {
2198                    PromExpr::Subquery(_)
2199                    | PromExpr::VectorSelector(_)
2200                    | PromExpr::MatrixSelector(_)
2201                    | PromExpr::Extension(_)
2202                    | PromExpr::Aggregate(_)
2203                    | PromExpr::Paren(_)
2204                    | PromExpr::Call(_)
2205                    | PromExpr::Binary(_)
2206                    | PromExpr::Unary(_) => {
2207                        if result.input.replace(*arg.clone()).is_some() {
2208                            MultipleVectorSnafu { expr: *arg.clone() }.fail()?;
2209                        }
2210                    }
2211
2212                    _ => {
2213                        let expr = Self::get_param_as_literal_expr(&Some(arg.clone()), None, None)?;
2214                        result.literals.push(expr);
2215                    }
2216                }
2217            }
2218        }
2219
2220        Ok(result)
2221    }
2222
2223    /// Creates function expressions for projection and returns the expressions and new tags.
2224    ///
2225    /// # Side Effects
2226    ///
2227    /// This method will update [PromPlannerContext]'s fields and tags if needed.
2228    fn create_function_expr(
2229        &mut self,
2230        func: &Function,
2231        other_input_exprs: Vec<DfExpr>,
2232        query_engine_state: &QueryEngineState,
2233    ) -> Result<(Vec<DfExpr>, Vec<String>)> {
2234        // TODO(ruihang): check function args list
2235        let mut other_input_exprs: VecDeque<DfExpr> = other_input_exprs.into();
2236
2237        // TODO(ruihang): set this according to in-param list
2238        let field_column_pos = 0;
2239        let mut exprs = Vec::with_capacity(self.ctx.field_columns.len());
2240        // New labels after executing the function, e.g. `label_replace` etc.
2241        let mut new_tags = vec![];
2242        let scalar_func = match func.name {
2243            "increase" => ScalarFunc::ExtrapolateUdf(
2244                Arc::new(Increase::scalar_udf()),
2245                self.ctx.range.context(ExpectRangeSelectorSnafu)?,
2246            ),
2247            "rate" => ScalarFunc::ExtrapolateUdf(
2248                Arc::new(Rate::scalar_udf()),
2249                self.ctx.range.context(ExpectRangeSelectorSnafu)?,
2250            ),
2251            "delta" => ScalarFunc::ExtrapolateUdf(
2252                Arc::new(Delta::scalar_udf()),
2253                self.ctx.range.context(ExpectRangeSelectorSnafu)?,
2254            ),
2255            "idelta" => ScalarFunc::Udf(Arc::new(IDelta::<false>::scalar_udf())),
2256            "irate" => ScalarFunc::Udf(Arc::new(IDelta::<true>::scalar_udf())),
2257            "resets" => ScalarFunc::Udf(Arc::new(Resets::scalar_udf())),
2258            "changes" => ScalarFunc::Udf(Arc::new(Changes::scalar_udf())),
2259            "deriv" => ScalarFunc::Udf(Arc::new(Deriv::scalar_udf())),
2260            "avg_over_time" => ScalarFunc::Udf(Arc::new(AvgOverTime::scalar_udf())),
2261            "min_over_time" => ScalarFunc::Udf(Arc::new(MinOverTime::scalar_udf())),
2262            "max_over_time" => ScalarFunc::Udf(Arc::new(MaxOverTime::scalar_udf())),
2263            "sum_over_time" => ScalarFunc::Udf(Arc::new(SumOverTime::scalar_udf())),
2264            "count_over_time" => ScalarFunc::Udf(Arc::new(CountOverTime::scalar_udf())),
2265            "last_over_time" => ScalarFunc::Udf(Arc::new(LastOverTime::scalar_udf())),
2266            "absent_over_time" => ScalarFunc::Udf(Arc::new(AbsentOverTime::scalar_udf())),
2267            "present_over_time" => ScalarFunc::Udf(Arc::new(PresentOverTime::scalar_udf())),
2268            "stddev_over_time" => ScalarFunc::Udf(Arc::new(StddevOverTime::scalar_udf())),
2269            "stdvar_over_time" => ScalarFunc::Udf(Arc::new(StdvarOverTime::scalar_udf())),
2270            "quantile_over_time" => ScalarFunc::Udf(Arc::new(QuantileOverTime::scalar_udf())),
2271            "predict_linear" => {
2272                other_input_exprs[0] = DfExpr::Cast(Cast {
2273                    expr: Box::new(other_input_exprs[0].clone()),
2274                    data_type: ArrowDataType::Int64,
2275                });
2276                ScalarFunc::Udf(Arc::new(PredictLinear::scalar_udf()))
2277            }
2278            "double_exponential_smoothing" | "holt_winters" => {
2279                ScalarFunc::Udf(Arc::new(DoubleExponentialSmoothing::scalar_udf()))
2280            }
2281            "time" => {
2282                exprs.push(build_special_time_expr(
2283                    self.ctx.time_index_column.as_ref().unwrap(),
2284                ));
2285                ScalarFunc::GeneratedExpr
2286            }
2287            "minute" => {
2288                // date_part('minute', time_index)
2289                let expr = self.date_part_on_time_index("minute")?;
2290                exprs.push(expr);
2291                ScalarFunc::GeneratedExpr
2292            }
2293            "hour" => {
2294                // date_part('hour', time_index)
2295                let expr = self.date_part_on_time_index("hour")?;
2296                exprs.push(expr);
2297                ScalarFunc::GeneratedExpr
2298            }
2299            "month" => {
2300                // date_part('month', time_index)
2301                let expr = self.date_part_on_time_index("month")?;
2302                exprs.push(expr);
2303                ScalarFunc::GeneratedExpr
2304            }
2305            "year" => {
2306                // date_part('year', time_index)
2307                let expr = self.date_part_on_time_index("year")?;
2308                exprs.push(expr);
2309                ScalarFunc::GeneratedExpr
2310            }
2311            "day_of_month" => {
2312                // date_part('day', time_index)
2313                let expr = self.date_part_on_time_index("day")?;
2314                exprs.push(expr);
2315                ScalarFunc::GeneratedExpr
2316            }
2317            "day_of_week" => {
2318                // date_part('dow', time_index)
2319                let expr = self.date_part_on_time_index("dow")?;
2320                exprs.push(expr);
2321                ScalarFunc::GeneratedExpr
2322            }
2323            "day_of_year" => {
2324                // date_part('doy', time_index)
2325                let expr = self.date_part_on_time_index("doy")?;
2326                exprs.push(expr);
2327                ScalarFunc::GeneratedExpr
2328            }
2329            "days_in_month" => {
2330                // date_part(
2331                //     'days',
2332                //     (date_trunc('month', <TIME INDEX>::date) + interval '1 month - 1 day')
2333                // );
2334                let day_lit_expr = "day".lit();
2335                let month_lit_expr = "month".lit();
2336                let interval_1month_lit_expr =
2337                    DfExpr::Literal(ScalarValue::IntervalYearMonth(Some(1)), None);
2338                let interval_1day_lit_expr = DfExpr::Literal(
2339                    ScalarValue::IntervalDayTime(Some(IntervalDayTime::new(1, 0))),
2340                    None,
2341                );
2342                let the_1month_minus_1day_expr = DfExpr::BinaryExpr(BinaryExpr {
2343                    left: Box::new(interval_1month_lit_expr),
2344                    op: Operator::Minus,
2345                    right: Box::new(interval_1day_lit_expr),
2346                });
2347                let date_trunc_expr = DfExpr::ScalarFunction(ScalarFunction {
2348                    func: datafusion_functions::datetime::date_trunc(),
2349                    args: vec![month_lit_expr, self.create_time_index_column_expr()?],
2350                });
2351                let date_trunc_plus_interval_expr = DfExpr::BinaryExpr(BinaryExpr {
2352                    left: Box::new(date_trunc_expr),
2353                    op: Operator::Plus,
2354                    right: Box::new(the_1month_minus_1day_expr),
2355                });
2356                let date_part_expr = DfExpr::ScalarFunction(ScalarFunction {
2357                    func: datafusion_functions::datetime::date_part(),
2358                    args: vec![day_lit_expr, date_trunc_plus_interval_expr],
2359                });
2360
2361                exprs.push(date_part_expr);
2362                ScalarFunc::GeneratedExpr
2363            }
2364
2365            "label_join" => {
2366                let (concat_expr, dst_label) = Self::build_concat_labels_expr(
2367                    &mut other_input_exprs,
2368                    &self.ctx,
2369                    query_engine_state,
2370                )?;
2371
2372                // Reserve the current field columns except the `dst_label`.
2373                for value in &self.ctx.field_columns {
2374                    if *value != dst_label {
2375                        let expr = DfExpr::Column(Column::from_name(value));
2376                        exprs.push(expr);
2377                    }
2378                }
2379
2380                // Remove it from tag columns if exists to avoid duplicated column names
2381                self.ctx.tag_columns.retain(|tag| *tag != dst_label);
2382                new_tags.push(dst_label);
2383                // Add the new label expr to evaluate
2384                exprs.push(concat_expr);
2385
2386                ScalarFunc::GeneratedExpr
2387            }
2388            "label_replace" => {
2389                if let Some((replace_expr, dst_label)) = self
2390                    .build_regexp_replace_label_expr(&mut other_input_exprs, query_engine_state)?
2391                {
2392                    // Reserve the current field columns except the `dst_label`.
2393                    for value in &self.ctx.field_columns {
2394                        if *value != dst_label {
2395                            let expr = DfExpr::Column(Column::from_name(value));
2396                            exprs.push(expr);
2397                        }
2398                    }
2399
2400                    ensure!(
2401                        !self.ctx.tag_columns.contains(&dst_label),
2402                        SameLabelSetSnafu
2403                    );
2404                    new_tags.push(dst_label);
2405                    // Add the new label expr to evaluate
2406                    exprs.push(replace_expr);
2407                } else {
2408                    // Keep the current field columns
2409                    for value in &self.ctx.field_columns {
2410                        let expr = DfExpr::Column(Column::from_name(value));
2411                        exprs.push(expr);
2412                    }
2413                }
2414
2415                ScalarFunc::GeneratedExpr
2416            }
2417            "sort" | "sort_desc" | "sort_by_label" | "sort_by_label_desc" | "timestamp" => {
2418                // These functions are not expression but a part of plan,
2419                // they are processed by `prom_call_expr_to_plan`.
2420                for value in &self.ctx.field_columns {
2421                    let expr = DfExpr::Column(Column::from_name(value));
2422                    exprs.push(expr);
2423                }
2424
2425                ScalarFunc::GeneratedExpr
2426            }
2427            "round" => {
2428                if other_input_exprs.is_empty() {
2429                    other_input_exprs.push_front(0.0f64.lit());
2430                }
2431                ScalarFunc::DataFusionUdf(Arc::new(Round::scalar_udf()))
2432            }
2433            "rad" => ScalarFunc::DataFusionBuiltin(datafusion::functions::math::radians()),
2434            "deg" => ScalarFunc::DataFusionBuiltin(datafusion::functions::math::degrees()),
2435            "sgn" => ScalarFunc::DataFusionBuiltin(datafusion::functions::math::signum()),
2436            "pi" => {
2437                // pi functions doesn't accepts any arguments, needs special processing
2438                let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
2439                    func: datafusion::functions::math::pi(),
2440                    args: vec![],
2441                });
2442                exprs.push(fn_expr);
2443
2444                ScalarFunc::GeneratedExpr
2445            }
2446            _ => {
2447                if let Some(f) = query_engine_state
2448                    .session_state()
2449                    .scalar_functions()
2450                    .get(func.name)
2451                {
2452                    ScalarFunc::DataFusionBuiltin(f.clone())
2453                } else if let Some(factory) = query_engine_state.scalar_function(func.name) {
2454                    let func_state = query_engine_state.function_state();
2455                    let query_ctx = self.table_provider.query_ctx();
2456
2457                    ScalarFunc::DataFusionUdf(Arc::new(factory.provide(FunctionContext {
2458                        state: func_state,
2459                        query_ctx: query_ctx.clone(),
2460                    })))
2461                } else if let Some(f) = datafusion_functions::math::functions()
2462                    .iter()
2463                    .find(|f| f.name() == func.name)
2464                {
2465                    ScalarFunc::DataFusionUdf(f.clone())
2466                } else {
2467                    return UnsupportedExprSnafu {
2468                        name: func.name.to_string(),
2469                    }
2470                    .fail();
2471                }
2472            }
2473        };
2474
2475        for value in &self.ctx.field_columns {
2476            let col_expr = DfExpr::Column(Column::from_name(value));
2477
2478            match scalar_func.clone() {
2479                ScalarFunc::DataFusionBuiltin(func) => {
2480                    other_input_exprs.insert(field_column_pos, col_expr);
2481                    let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
2482                        func,
2483                        args: other_input_exprs.clone().into(),
2484                    });
2485                    exprs.push(fn_expr);
2486                    let _ = other_input_exprs.remove(field_column_pos);
2487                }
2488                ScalarFunc::DataFusionUdf(func) => {
2489                    let args = itertools::chain!(
2490                        other_input_exprs.iter().take(field_column_pos).cloned(),
2491                        std::iter::once(col_expr),
2492                        other_input_exprs.iter().skip(field_column_pos).cloned()
2493                    )
2494                    .collect_vec();
2495                    exprs.push(DfExpr::ScalarFunction(ScalarFunction { func, args }))
2496                }
2497                ScalarFunc::Udf(func) => {
2498                    let ts_range_expr = DfExpr::Column(Column::from_name(
2499                        RangeManipulate::build_timestamp_range_name(
2500                            self.ctx.time_index_column.as_ref().unwrap(),
2501                        ),
2502                    ));
2503                    other_input_exprs.insert(field_column_pos, ts_range_expr);
2504                    other_input_exprs.insert(field_column_pos + 1, col_expr);
2505                    let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
2506                        func,
2507                        args: other_input_exprs.clone().into(),
2508                    });
2509                    exprs.push(fn_expr);
2510                    let _ = other_input_exprs.remove(field_column_pos + 1);
2511                    let _ = other_input_exprs.remove(field_column_pos);
2512                }
2513                ScalarFunc::ExtrapolateUdf(func, range_length) => {
2514                    let ts_range_expr = DfExpr::Column(Column::from_name(
2515                        RangeManipulate::build_timestamp_range_name(
2516                            self.ctx.time_index_column.as_ref().unwrap(),
2517                        ),
2518                    ));
2519                    other_input_exprs.insert(field_column_pos, ts_range_expr);
2520                    other_input_exprs.insert(field_column_pos + 1, col_expr);
2521                    other_input_exprs
2522                        .insert(field_column_pos + 2, self.create_time_index_column_expr()?);
2523                    other_input_exprs.push_back(lit(range_length));
2524                    let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
2525                        func,
2526                        args: other_input_exprs.clone().into(),
2527                    });
2528                    exprs.push(fn_expr);
2529                    let _ = other_input_exprs.pop_back();
2530                    let _ = other_input_exprs.remove(field_column_pos + 2);
2531                    let _ = other_input_exprs.remove(field_column_pos + 1);
2532                    let _ = other_input_exprs.remove(field_column_pos);
2533                }
2534                ScalarFunc::GeneratedExpr => {}
2535            }
2536        }
2537
2538        // Update value columns' name, and alias them to remove qualifiers
2539        // For label functions such as `label_join`, `label_replace`, etc.,
2540        // we keep the fields unchanged.
2541        if !matches!(func.name, "label_join" | "label_replace") {
2542            let mut new_field_columns = Vec::with_capacity(exprs.len());
2543
2544            exprs = exprs
2545                .into_iter()
2546                .map(|expr| {
2547                    let display_name = expr.schema_name().to_string();
2548                    new_field_columns.push(display_name.clone());
2549                    Ok(expr.alias(display_name))
2550                })
2551                .collect::<std::result::Result<Vec<_>, _>>()
2552                .context(DataFusionPlanningSnafu)?;
2553
2554            self.ctx.field_columns = new_field_columns;
2555        }
2556
2557        Ok((exprs, new_tags))
2558    }
2559
2560    /// Validate label name according to Prometheus specification.
2561    /// Label names must match the regex: [a-zA-Z_][a-zA-Z0-9_]*
2562    /// Additionally, label names starting with double underscores are reserved for internal use.
2563    fn validate_label_name(label_name: &str) -> Result<()> {
2564        // Check if label name starts with double underscores (reserved)
2565        if label_name.starts_with("__") {
2566            return InvalidDestinationLabelNameSnafu { label_name }.fail();
2567        }
2568        // Check if label name matches the required pattern
2569        if !LABEL_NAME_REGEX.is_match(label_name) {
2570            return InvalidDestinationLabelNameSnafu { label_name }.fail();
2571        }
2572
2573        Ok(())
2574    }
2575
2576    /// Build expr for `label_replace` function
2577    fn build_regexp_replace_label_expr(
2578        &self,
2579        other_input_exprs: &mut VecDeque<DfExpr>,
2580        query_engine_state: &QueryEngineState,
2581    ) -> Result<Option<(DfExpr, String)>> {
2582        // label_replace(vector, dst_label, replacement, src_label, regex)
2583        let dst_label = match other_input_exprs.pop_front() {
2584            Some(DfExpr::Literal(ScalarValue::Utf8(Some(d)), _)) => d,
2585            other => UnexpectedPlanExprSnafu {
2586                desc: format!("expected dst_label string literal, but found {:?}", other),
2587            }
2588            .fail()?,
2589        };
2590
2591        // Validate the destination label name
2592        Self::validate_label_name(&dst_label)?;
2593        let replacement = match other_input_exprs.pop_front() {
2594            Some(DfExpr::Literal(ScalarValue::Utf8(Some(r)), _)) => r,
2595            other => UnexpectedPlanExprSnafu {
2596                desc: format!("expected replacement string literal, but found {:?}", other),
2597            }
2598            .fail()?,
2599        };
2600        let src_label = match other_input_exprs.pop_front() {
2601            Some(DfExpr::Literal(ScalarValue::Utf8(Some(s)), None)) => s,
2602            other => UnexpectedPlanExprSnafu {
2603                desc: format!("expected src_label string literal, but found {:?}", other),
2604            }
2605            .fail()?,
2606        };
2607
2608        let regex = match other_input_exprs.pop_front() {
2609            Some(DfExpr::Literal(ScalarValue::Utf8(Some(r)), None)) => r,
2610            other => UnexpectedPlanExprSnafu {
2611                desc: format!("expected regex string literal, but found {:?}", other),
2612            }
2613            .fail()?,
2614        };
2615
2616        // Validate the regex before using it
2617        // doc: https://prometheus.io/docs/prometheus/latest/querying/functions/#label_replace
2618        regex::Regex::new(&regex).map_err(|_| {
2619            InvalidRegularExpressionSnafu {
2620                regex: regex.clone(),
2621            }
2622            .build()
2623        })?;
2624
2625        // If the src_label exists and regex is empty, keep everything unchanged.
2626        if self.ctx.tag_columns.contains(&src_label) && regex.is_empty() {
2627            return Ok(None);
2628        }
2629
2630        // If the src_label doesn't exists, and
2631        if !self.ctx.tag_columns.contains(&src_label) {
2632            if replacement.is_empty() {
2633                // the replacement is empty, keep everything unchanged.
2634                return Ok(None);
2635            } else {
2636                // the replacement is not empty, always adds dst_label with replacement value.
2637                return Ok(Some((
2638                    // alias literal `replacement` as dst_label
2639                    lit(replacement).alias(&dst_label),
2640                    dst_label,
2641                )));
2642            }
2643        }
2644
2645        // Preprocess the regex:
2646        // https://github.com/prometheus/prometheus/blob/d902abc50d6652ba8fe9a81ff8e5cce936114eba/promql/functions.go#L1575C32-L1575C37
2647        let regex = format!("^(?s:{regex})$");
2648
2649        let session_state = query_engine_state.session_state();
2650        let func = session_state
2651            .scalar_functions()
2652            .get("regexp_replace")
2653            .context(UnsupportedExprSnafu {
2654                name: "regexp_replace",
2655            })?;
2656
2657        // regexp_replace(src_label, regex, replacement)
2658        let args = vec![
2659            if src_label.is_empty() {
2660                DfExpr::Literal(ScalarValue::Utf8(Some(String::new())), None)
2661            } else {
2662                DfExpr::Column(Column::from_name(src_label))
2663            },
2664            DfExpr::Literal(ScalarValue::Utf8(Some(regex)), None),
2665            DfExpr::Literal(ScalarValue::Utf8(Some(replacement)), None),
2666        ];
2667
2668        Ok(Some((
2669            DfExpr::ScalarFunction(ScalarFunction {
2670                func: func.clone(),
2671                args,
2672            })
2673            .alias(&dst_label),
2674            dst_label,
2675        )))
2676    }
2677
2678    /// Build expr for `label_join` function
2679    fn build_concat_labels_expr(
2680        other_input_exprs: &mut VecDeque<DfExpr>,
2681        ctx: &PromPlannerContext,
2682        query_engine_state: &QueryEngineState,
2683    ) -> Result<(DfExpr, String)> {
2684        // label_join(vector, dst_label, separator, src_label_1, src_label_2, ...)
2685
2686        let dst_label = match other_input_exprs.pop_front() {
2687            Some(DfExpr::Literal(ScalarValue::Utf8(Some(d)), _)) => d,
2688            other => UnexpectedPlanExprSnafu {
2689                desc: format!("expected dst_label string literal, but found {:?}", other),
2690            }
2691            .fail()?,
2692        };
2693        let separator = match other_input_exprs.pop_front() {
2694            Some(DfExpr::Literal(ScalarValue::Utf8(Some(d)), _)) => d,
2695            other => UnexpectedPlanExprSnafu {
2696                desc: format!("expected separator string literal, but found {:?}", other),
2697            }
2698            .fail()?,
2699        };
2700
2701        // Create a set of available columns (tag columns + field columns + time index column)
2702        let available_columns: HashSet<&str> = ctx
2703            .tag_columns
2704            .iter()
2705            .chain(ctx.field_columns.iter())
2706            .chain(ctx.time_index_column.as_ref())
2707            .map(|s| s.as_str())
2708            .collect();
2709
2710        let src_labels = other_input_exprs
2711            .iter()
2712            .map(|expr| {
2713                // Cast source label into column or null literal
2714                match expr {
2715                    DfExpr::Literal(ScalarValue::Utf8(Some(label)), None) => {
2716                        if label.is_empty() {
2717                            Ok(DfExpr::Literal(ScalarValue::Null, None))
2718                        } else if available_columns.contains(label.as_str()) {
2719                            // Label exists in the table schema
2720                            Ok(DfExpr::Column(Column::from_name(label)))
2721                        } else {
2722                            // Label doesn't exist, treat as empty string (null)
2723                            Ok(DfExpr::Literal(ScalarValue::Null, None))
2724                        }
2725                    }
2726                    other => UnexpectedPlanExprSnafu {
2727                        desc: format!(
2728                            "expected source label string literal, but found {:?}",
2729                            other
2730                        ),
2731                    }
2732                    .fail(),
2733                }
2734            })
2735            .collect::<Result<Vec<_>>>()?;
2736        ensure!(
2737            !src_labels.is_empty(),
2738            FunctionInvalidArgumentSnafu {
2739                fn_name: "label_join"
2740            }
2741        );
2742
2743        let session_state = query_engine_state.session_state();
2744        let func = session_state
2745            .scalar_functions()
2746            .get("concat_ws")
2747            .context(UnsupportedExprSnafu { name: "concat_ws" })?;
2748
2749        // concat_ws(separator, src_label_1, src_label_2, ...) as dst_label
2750        let mut args = Vec::with_capacity(1 + src_labels.len());
2751        args.push(DfExpr::Literal(ScalarValue::Utf8(Some(separator)), None));
2752        args.extend(src_labels);
2753
2754        Ok((
2755            DfExpr::ScalarFunction(ScalarFunction {
2756                func: func.clone(),
2757                args,
2758            })
2759            .alias(&dst_label),
2760            dst_label,
2761        ))
2762    }
2763
2764    fn create_time_index_column_expr(&self) -> Result<DfExpr> {
2765        Ok(DfExpr::Column(Column::from_name(
2766            self.ctx
2767                .time_index_column
2768                .clone()
2769                .with_context(|| TimeIndexNotFoundSnafu { table: "unknown" })?,
2770        )))
2771    }
2772
2773    fn create_tag_column_exprs(&self) -> Result<Vec<DfExpr>> {
2774        let mut result = Vec::with_capacity(self.ctx.tag_columns.len());
2775        for tag in &self.ctx.tag_columns {
2776            let expr = DfExpr::Column(Column::from_name(tag));
2777            result.push(expr);
2778        }
2779        Ok(result)
2780    }
2781
2782    fn create_field_column_exprs(&self) -> Result<Vec<DfExpr>> {
2783        let mut result = Vec::with_capacity(self.ctx.field_columns.len());
2784        for field in &self.ctx.field_columns {
2785            let expr = DfExpr::Column(Column::from_name(field));
2786            result.push(expr);
2787        }
2788        Ok(result)
2789    }
2790
2791    fn create_tag_and_time_index_column_sort_exprs(&self) -> Result<Vec<SortExpr>> {
2792        let mut result = self
2793            .ctx
2794            .tag_columns
2795            .iter()
2796            .map(|col| DfExpr::Column(Column::from_name(col)).sort(true, true))
2797            .collect::<Vec<_>>();
2798        result.push(self.create_time_index_column_expr()?.sort(true, true));
2799        Ok(result)
2800    }
2801
2802    fn create_field_columns_sort_exprs(&self, asc: bool) -> Vec<SortExpr> {
2803        self.ctx
2804            .field_columns
2805            .iter()
2806            .map(|col| DfExpr::Column(Column::from_name(col)).sort(asc, true))
2807            .collect::<Vec<_>>()
2808    }
2809
2810    fn create_sort_exprs_by_tags(
2811        func: &str,
2812        tags: Vec<DfExpr>,
2813        asc: bool,
2814    ) -> Result<Vec<SortExpr>> {
2815        ensure!(
2816            !tags.is_empty(),
2817            FunctionInvalidArgumentSnafu { fn_name: func }
2818        );
2819
2820        tags.iter()
2821            .map(|col| match col {
2822                DfExpr::Literal(ScalarValue::Utf8(Some(label)), _) => {
2823                    Ok(DfExpr::Column(Column::from_name(label)).sort(asc, false))
2824                }
2825                other => UnexpectedPlanExprSnafu {
2826                    desc: format!("expected label string literal, but found {:?}", other),
2827                }
2828                .fail(),
2829            })
2830            .collect::<Result<Vec<_>>>()
2831    }
2832
2833    fn create_empty_values_filter_expr(&self) -> Result<DfExpr> {
2834        let mut exprs = Vec::with_capacity(self.ctx.field_columns.len());
2835        for value in &self.ctx.field_columns {
2836            let expr = DfExpr::Column(Column::from_name(value)).is_not_null();
2837            exprs.push(expr);
2838        }
2839
2840        // This error context should be computed lazily: the planner may set `ctx.table_name` to
2841        // `None` for derived expressions (e.g. after projecting the LHS of a vector-vector
2842        // comparison filter). Eagerly calling `table_ref()?` here can turn a valid plan into
2843        // a `TableNameNotFound` error even when `conjunction(exprs)` succeeds.
2844        conjunction(exprs).with_context(|| ValueNotFoundSnafu {
2845            table: self
2846                .table_ref()
2847                .map(|t| t.to_quoted_string())
2848                .unwrap_or_else(|_| "unknown".to_string()),
2849        })
2850    }
2851
2852    /// Creates a set of DataFusion `DfExpr::AggregateFunction` expressions for each value column using the specified aggregate function.
2853    ///
2854    /// # Side Effects
2855    ///
2856    /// This method modifies the value columns in the context by replacing them with the new columns
2857    /// created by the aggregate function application.
2858    ///
2859    /// # Returns
2860    ///
2861    /// Returns a tuple of `(aggregate_expressions, previous_field_expressions)` where:
2862    /// - `aggregate_expressions`: Expressions that apply the aggregate function to the original fields
2863    /// - `previous_field_expressions`: Original field expressions before aggregation. This is non-empty
2864    ///   only when the operation is `count_values`, as this operation requires preserving the original
2865    ///   values for grouping.
2866    ///
2867    fn create_aggregate_exprs(
2868        &mut self,
2869        op: TokenType,
2870        param: &Option<Box<PromExpr>>,
2871        input_plan: &LogicalPlan,
2872    ) -> Result<(Vec<DfExpr>, Vec<DfExpr>)> {
2873        let mut non_col_args = Vec::new();
2874        let is_group_agg = op.id() == token::T_GROUP;
2875        if is_group_agg {
2876            ensure!(
2877                self.ctx.field_columns.len() == 1,
2878                MultiFieldsNotSupportedSnafu {
2879                    operator: "group()"
2880                }
2881            );
2882        }
2883        let aggr = match op.id() {
2884            token::T_SUM => sum_udaf(),
2885            token::T_QUANTILE => {
2886                let q =
2887                    Self::get_param_as_literal_expr(param, Some(op), Some(ArrowDataType::Float64))?;
2888                non_col_args.push(q);
2889                quantile_udaf()
2890            }
2891            token::T_AVG => avg_udaf(),
2892            token::T_COUNT_VALUES | token::T_COUNT => count_udaf(),
2893            token::T_MIN => min_udaf(),
2894            token::T_MAX => max_udaf(),
2895            // PromQL's `group()` aggregator produces 1 for each group.
2896            // Use `max(1.0)` (per-group) to match semantics and output type (Float64).
2897            token::T_GROUP => max_udaf(),
2898            token::T_STDDEV => stddev_pop_udaf(),
2899            token::T_STDVAR => var_pop_udaf(),
2900            token::T_TOPK | token::T_BOTTOMK => UnsupportedExprSnafu {
2901                name: format!("{op:?}"),
2902            }
2903            .fail()?,
2904            _ => UnexpectedTokenSnafu { token: op }.fail()?,
2905        };
2906
2907        // perform aggregate operation to each value column
2908        let exprs: Vec<DfExpr> = self
2909            .ctx
2910            .field_columns
2911            .iter()
2912            .map(|col| {
2913                if is_group_agg {
2914                    aggr.call(vec![lit(1_f64)])
2915                } else {
2916                    non_col_args.push(DfExpr::Column(Column::from_name(col)));
2917                    let expr = aggr.call(non_col_args.clone());
2918                    non_col_args.pop();
2919                    expr
2920                }
2921            })
2922            .collect::<Vec<_>>();
2923
2924        // if the aggregator is `count_values`, it must be grouped by current fields.
2925        let prev_field_exprs = if op.id() == token::T_COUNT_VALUES {
2926            let prev_field_exprs: Vec<_> = self
2927                .ctx
2928                .field_columns
2929                .iter()
2930                .map(|col| DfExpr::Column(Column::from_name(col)))
2931                .collect();
2932
2933            ensure!(
2934                self.ctx.field_columns.len() == 1,
2935                UnsupportedExprSnafu {
2936                    name: "count_values on multi-value input"
2937                }
2938            );
2939
2940            prev_field_exprs
2941        } else {
2942            vec![]
2943        };
2944
2945        // update value column name according to the aggregators,
2946        let mut new_field_columns = Vec::with_capacity(self.ctx.field_columns.len());
2947
2948        let normalized_exprs =
2949            normalize_cols(exprs.iter().cloned(), input_plan).context(DataFusionPlanningSnafu)?;
2950        for expr in normalized_exprs {
2951            new_field_columns.push(expr.schema_name().to_string());
2952        }
2953        self.ctx.field_columns = new_field_columns;
2954
2955        Ok((exprs, prev_field_exprs))
2956    }
2957
2958    fn get_param_value_as_str(op: TokenType, param: &Option<Box<PromExpr>>) -> Result<&str> {
2959        let param = param
2960            .as_deref()
2961            .with_context(|| FunctionInvalidArgumentSnafu {
2962                fn_name: op.to_string(),
2963            })?;
2964        let PromExpr::StringLiteral(StringLiteral { val }) = param else {
2965            return FunctionInvalidArgumentSnafu {
2966                fn_name: op.to_string(),
2967            }
2968            .fail();
2969        };
2970
2971        Ok(val)
2972    }
2973
2974    fn get_param_as_literal_expr(
2975        param: &Option<Box<PromExpr>>,
2976        op: Option<TokenType>,
2977        expected_type: Option<ArrowDataType>,
2978    ) -> Result<DfExpr> {
2979        let prom_param = param.as_deref().with_context(|| {
2980            if let Some(op) = op {
2981                FunctionInvalidArgumentSnafu {
2982                    fn_name: op.to_string(),
2983                }
2984            } else {
2985                FunctionInvalidArgumentSnafu {
2986                    fn_name: "unknown".to_string(),
2987                }
2988            }
2989        })?;
2990
2991        let expr = Self::try_build_literal_expr(prom_param).with_context(|| {
2992            if let Some(op) = op {
2993                FunctionInvalidArgumentSnafu {
2994                    fn_name: op.to_string(),
2995                }
2996            } else {
2997                FunctionInvalidArgumentSnafu {
2998                    fn_name: "unknown".to_string(),
2999                }
3000            }
3001        })?;
3002
3003        // check if the type is expected
3004        if let Some(expected_type) = expected_type {
3005            // literal should not have reference to column
3006            let expr_type = expr
3007                .get_type(&DFSchema::empty())
3008                .context(DataFusionPlanningSnafu)?;
3009            if expected_type != expr_type {
3010                return FunctionInvalidArgumentSnafu {
3011                    fn_name: format!("expected {expected_type:?}, but found {expr_type:?}"),
3012                }
3013                .fail();
3014            }
3015        }
3016
3017        Ok(expr)
3018    }
3019
3020    /// Create [DfExpr::WindowFunction] expr for each value column with given window function.
3021    ///
3022    fn create_window_exprs(
3023        &mut self,
3024        op: TokenType,
3025        group_exprs: Vec<DfExpr>,
3026        input_plan: &LogicalPlan,
3027    ) -> Result<Vec<DfExpr>> {
3028        ensure!(
3029            self.ctx.field_columns.len() == 1,
3030            UnsupportedExprSnafu {
3031                name: "topk or bottomk on multi-value input"
3032            }
3033        );
3034
3035        assert!(matches!(op.id(), token::T_TOPK | token::T_BOTTOMK));
3036
3037        let asc = matches!(op.id(), token::T_BOTTOMK);
3038
3039        let tag_sort_exprs = self
3040            .create_tag_column_exprs()?
3041            .into_iter()
3042            .map(|expr| expr.sort(asc, true));
3043
3044        // perform window operation to each value column
3045        let exprs: Vec<DfExpr> = self
3046            .ctx
3047            .field_columns
3048            .iter()
3049            .map(|col| {
3050                let mut sort_exprs = Vec::with_capacity(self.ctx.tag_columns.len() + 1);
3051                // Order by value in the specific order
3052                sort_exprs.push(DfExpr::Column(Column::from(col)).sort(asc, true));
3053                // Then tags if the values are equal,
3054                // Try to ensure the relative stability of the output results.
3055                sort_exprs.extend(tag_sort_exprs.clone());
3056
3057                DfExpr::WindowFunction(Box::new(WindowFunction {
3058                    fun: WindowFunctionDefinition::WindowUDF(Arc::new(RowNumber::new().into())),
3059                    params: WindowFunctionParams {
3060                        args: vec![],
3061                        partition_by: group_exprs.clone(),
3062                        order_by: sort_exprs,
3063                        window_frame: WindowFrame::new(Some(true)),
3064                        null_treatment: None,
3065                        distinct: false,
3066                        filter: None,
3067                    },
3068                }))
3069            })
3070            .collect();
3071
3072        let normalized_exprs =
3073            normalize_cols(exprs.iter().cloned(), input_plan).context(DataFusionPlanningSnafu)?;
3074        Ok(normalized_exprs)
3075    }
3076
3077    /// Try to build a [f64] from [PromExpr].
3078    #[deprecated(
3079        note = "use `Self::get_param_as_literal_expr` instead. This is only for `create_histogram_plan`"
3080    )]
3081    fn try_build_float_literal(expr: &PromExpr) -> Option<f64> {
3082        match expr {
3083            PromExpr::NumberLiteral(NumberLiteral { val }) => Some(*val),
3084            PromExpr::Paren(ParenExpr { expr }) => Self::try_build_float_literal(expr),
3085            PromExpr::Unary(UnaryExpr { expr, .. }) => {
3086                Self::try_build_float_literal(expr).map(|f| -f)
3087            }
3088            PromExpr::StringLiteral(_)
3089            | PromExpr::Binary(_)
3090            | PromExpr::VectorSelector(_)
3091            | PromExpr::MatrixSelector(_)
3092            | PromExpr::Call(_)
3093            | PromExpr::Extension(_)
3094            | PromExpr::Aggregate(_)
3095            | PromExpr::Subquery(_) => None,
3096        }
3097    }
3098
3099    /// Create a [SPECIAL_HISTOGRAM_QUANTILE] plan.
3100    async fn create_histogram_plan(
3101        &mut self,
3102        args: &PromFunctionArgs,
3103        query_engine_state: &QueryEngineState,
3104    ) -> Result<LogicalPlan> {
3105        if args.args.len() != 2 {
3106            return FunctionInvalidArgumentSnafu {
3107                fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(),
3108            }
3109            .fail();
3110        }
3111        #[allow(deprecated)]
3112        let phi = Self::try_build_float_literal(&args.args[0]).with_context(|| {
3113            FunctionInvalidArgumentSnafu {
3114                fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(),
3115            }
3116        })?;
3117
3118        let input = args.args[1].as_ref().clone();
3119        let input_plan = self.prom_expr_to_plan(&input, query_engine_state).await?;
3120        // `histogram_quantile` folds buckets across `le`, so `__tsid` (which includes `le`) is not
3121        // a stable series identifier anymore. Also, HistogramFold infers label columns from the
3122        // input schema and must not treat `__tsid` as a label column.
3123        let input_plan = self.strip_tsid_column(input_plan)?;
3124        self.ctx.use_tsid = false;
3125
3126        if !self.ctx.has_le_tag() {
3127            // Return empty result instead of error when 'le' column is not found
3128            // This handles the case when histogram metrics don't exist
3129            return Ok(LogicalPlan::EmptyRelation(
3130                datafusion::logical_expr::EmptyRelation {
3131                    produce_one_row: false,
3132                    schema: Arc::new(DFSchema::empty()),
3133                },
3134            ));
3135        }
3136        let time_index_column =
3137            self.ctx
3138                .time_index_column
3139                .clone()
3140                .with_context(|| TimeIndexNotFoundSnafu {
3141                    table: self.ctx.table_name.clone().unwrap_or_default(),
3142                })?;
3143        // FIXME(ruihang): support multi fields
3144        let field_column = self
3145            .ctx
3146            .field_columns
3147            .first()
3148            .with_context(|| FunctionInvalidArgumentSnafu {
3149                fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(),
3150            })?
3151            .clone();
3152        // remove le column from tag columns
3153        self.ctx.tag_columns.retain(|col| col != LE_COLUMN_NAME);
3154
3155        Ok(LogicalPlan::Extension(Extension {
3156            node: Arc::new(
3157                HistogramFold::new(
3158                    LE_COLUMN_NAME.to_string(),
3159                    field_column,
3160                    time_index_column,
3161                    phi,
3162                    input_plan,
3163                )
3164                .context(DataFusionPlanningSnafu)?,
3165            ),
3166        }))
3167    }
3168
3169    /// Create a [SPECIAL_VECTOR_FUNCTION] plan
3170    async fn create_vector_plan(&mut self, args: &PromFunctionArgs) -> Result<LogicalPlan> {
3171        if args.args.len() != 1 {
3172            return FunctionInvalidArgumentSnafu {
3173                fn_name: SPECIAL_VECTOR_FUNCTION.to_string(),
3174            }
3175            .fail();
3176        }
3177        let lit = Self::get_param_as_literal_expr(&Some(args.args[0].clone()), None, None)?;
3178
3179        // reuse `SPECIAL_TIME_FUNCTION` as name of time index column
3180        self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string());
3181        self.ctx.reset_table_name_and_schema();
3182        self.ctx.tag_columns = vec![];
3183        self.ctx.field_columns = vec![greptime_value().to_string()];
3184        Ok(LogicalPlan::Extension(Extension {
3185            node: Arc::new(
3186                EmptyMetric::new(
3187                    self.ctx.start,
3188                    self.ctx.end,
3189                    self.ctx.interval,
3190                    SPECIAL_TIME_FUNCTION.to_string(),
3191                    greptime_value().to_string(),
3192                    Some(lit),
3193                )
3194                .context(DataFusionPlanningSnafu)?,
3195            ),
3196        }))
3197    }
3198
3199    /// Create a [SCALAR_FUNCTION] plan
3200    async fn create_scalar_plan(
3201        &mut self,
3202        args: &PromFunctionArgs,
3203        query_engine_state: &QueryEngineState,
3204    ) -> Result<LogicalPlan> {
3205        ensure!(
3206            args.len() == 1,
3207            FunctionInvalidArgumentSnafu {
3208                fn_name: SCALAR_FUNCTION
3209            }
3210        );
3211        let input = self
3212            .prom_expr_to_plan(&args.args[0], query_engine_state)
3213            .await?;
3214        ensure!(
3215            self.ctx.field_columns.len() == 1,
3216            MultiFieldsNotSupportedSnafu {
3217                operator: SCALAR_FUNCTION
3218            },
3219        );
3220        let scalar_plan = LogicalPlan::Extension(Extension {
3221            node: Arc::new(
3222                ScalarCalculate::new(
3223                    self.ctx.start,
3224                    self.ctx.end,
3225                    self.ctx.interval,
3226                    input,
3227                    self.ctx.time_index_column.as_ref().unwrap(),
3228                    &self.ctx.tag_columns,
3229                    &self.ctx.field_columns[0],
3230                    self.ctx.table_name.as_deref(),
3231                )
3232                .context(PromqlPlanNodeSnafu)?,
3233            ),
3234        });
3235        // scalar plan have no tag columns
3236        self.ctx.tag_columns.clear();
3237        self.ctx.field_columns.clear();
3238        self.ctx
3239            .field_columns
3240            .push(scalar_plan.schema().field(1).name().clone());
3241        Ok(scalar_plan)
3242    }
3243
3244    /// Create a [SPECIAL_ABSENT_FUNCTION] plan
3245    async fn create_absent_plan(
3246        &mut self,
3247        args: &PromFunctionArgs,
3248        query_engine_state: &QueryEngineState,
3249    ) -> Result<LogicalPlan> {
3250        if args.args.len() != 1 {
3251            return FunctionInvalidArgumentSnafu {
3252                fn_name: SPECIAL_ABSENT_FUNCTION.to_string(),
3253            }
3254            .fail();
3255        }
3256        let input = self
3257            .prom_expr_to_plan(&args.args[0], query_engine_state)
3258            .await?;
3259
3260        let time_index_expr = self.create_time_index_column_expr()?;
3261        let first_field_expr =
3262            self.create_field_column_exprs()?
3263                .pop()
3264                .with_context(|| ValueNotFoundSnafu {
3265                    table: self.ctx.table_name.clone().unwrap_or_default(),
3266                })?;
3267        let first_value_expr = first_value(first_field_expr, vec![]);
3268
3269        let ordered_aggregated_input = LogicalPlanBuilder::from(input)
3270            .aggregate(
3271                vec![time_index_expr.clone()],
3272                vec![first_value_expr.clone()],
3273            )
3274            .context(DataFusionPlanningSnafu)?
3275            .sort(vec![time_index_expr.sort(true, false)])
3276            .context(DataFusionPlanningSnafu)?
3277            .build()
3278            .context(DataFusionPlanningSnafu)?;
3279
3280        let fake_labels = self
3281            .ctx
3282            .selector_matcher
3283            .iter()
3284            .filter_map(|matcher| match matcher.op {
3285                MatchOp::Equal => Some((matcher.name.clone(), matcher.value.clone())),
3286                _ => None,
3287            })
3288            .collect::<Vec<_>>();
3289
3290        // Create the absent plan
3291        let absent_plan = LogicalPlan::Extension(Extension {
3292            node: Arc::new(
3293                Absent::try_new(
3294                    self.ctx.start,
3295                    self.ctx.end,
3296                    self.ctx.interval,
3297                    self.ctx.time_index_column.as_ref().unwrap().clone(),
3298                    self.ctx.field_columns[0].clone(),
3299                    fake_labels,
3300                    ordered_aggregated_input,
3301                )
3302                .context(DataFusionPlanningSnafu)?,
3303            ),
3304        });
3305
3306        Ok(absent_plan)
3307    }
3308
3309    /// Try to build a DataFusion Literal Expression from PromQL Expr, return
3310    /// `None` if the input is not a literal expression.
3311    fn try_build_literal_expr(expr: &PromExpr) -> Option<DfExpr> {
3312        match expr {
3313            PromExpr::NumberLiteral(NumberLiteral { val }) => Some(val.lit()),
3314            PromExpr::StringLiteral(StringLiteral { val }) => Some(val.lit()),
3315            PromExpr::VectorSelector(_)
3316            | PromExpr::MatrixSelector(_)
3317            | PromExpr::Extension(_)
3318            | PromExpr::Aggregate(_)
3319            | PromExpr::Subquery(_) => None,
3320            PromExpr::Call(Call { func, .. }) => {
3321                if func.name == SPECIAL_TIME_FUNCTION {
3322                    // For time() function, don't treat it as a literal
3323                    // Let it be handled as a regular function call
3324                    None
3325                } else {
3326                    None
3327                }
3328            }
3329            PromExpr::Paren(ParenExpr { expr }) => Self::try_build_literal_expr(expr),
3330            // TODO(ruihang): support Unary operator
3331            PromExpr::Unary(UnaryExpr { expr, .. }) => Self::try_build_literal_expr(expr),
3332            PromExpr::Binary(PromBinaryExpr {
3333                lhs,
3334                rhs,
3335                op,
3336                modifier,
3337            }) => {
3338                let lhs = Self::try_build_literal_expr(lhs)?;
3339                let rhs = Self::try_build_literal_expr(rhs)?;
3340                let is_comparison_op = Self::is_token_a_comparison_op(*op);
3341                let expr_builder = Self::prom_token_to_binary_expr_builder(*op).ok()?;
3342                let expr = expr_builder(lhs, rhs).ok()?;
3343
3344                let should_return_bool = if let Some(m) = modifier {
3345                    m.return_bool
3346                } else {
3347                    false
3348                };
3349                if is_comparison_op && should_return_bool {
3350                    Some(DfExpr::Cast(Cast {
3351                        expr: Box::new(expr),
3352                        data_type: ArrowDataType::Float64,
3353                    }))
3354                } else {
3355                    Some(expr)
3356                }
3357            }
3358        }
3359    }
3360
3361    fn try_build_special_time_expr_with_context(&self, expr: &PromExpr) -> Option<DfExpr> {
3362        match expr {
3363            PromExpr::Call(Call { func, .. }) => {
3364                if func.name == SPECIAL_TIME_FUNCTION
3365                    && let Some(time_index_col) = self.ctx.time_index_column.as_ref()
3366                {
3367                    Some(build_special_time_expr(time_index_col))
3368                } else {
3369                    None
3370                }
3371            }
3372            _ => None,
3373        }
3374    }
3375
3376    /// Return a lambda to build binary expression from token.
3377    /// Because some binary operator are function in DataFusion like `atan2` or `^`.
3378    #[allow(clippy::type_complexity)]
3379    fn prom_token_to_binary_expr_builder(
3380        token: TokenType,
3381    ) -> Result<Box<dyn Fn(DfExpr, DfExpr) -> Result<DfExpr>>> {
3382        let cast_float = |expr| {
3383            if matches!(
3384                &expr,
3385                DfExpr::Cast(Cast {
3386                    data_type: ArrowDataType::Float64,
3387                    ..
3388                })
3389            ) || matches!(&expr, DfExpr::Literal(ScalarValue::Float64(_), _))
3390            {
3391                expr
3392            } else {
3393                DfExpr::Cast(Cast {
3394                    expr: Box::new(expr),
3395                    data_type: ArrowDataType::Float64,
3396                })
3397            }
3398        };
3399        match token.id() {
3400            token::T_ADD => Ok(Box::new(move |lhs, rhs| {
3401                Ok(cast_float(lhs) + cast_float(rhs))
3402            })),
3403            token::T_SUB => Ok(Box::new(move |lhs, rhs| {
3404                Ok(cast_float(lhs) - cast_float(rhs))
3405            })),
3406            token::T_MUL => Ok(Box::new(move |lhs, rhs| {
3407                Ok(cast_float(lhs) * cast_float(rhs))
3408            })),
3409            token::T_DIV => Ok(Box::new(move |lhs, rhs| {
3410                Ok(cast_float(lhs) / cast_float(rhs))
3411            })),
3412            token::T_MOD => Ok(Box::new(move |lhs: DfExpr, rhs| {
3413                Ok(cast_float(lhs) % cast_float(rhs))
3414            })),
3415            token::T_EQLC => Ok(Box::new(|lhs, rhs| Ok(lhs.eq(rhs)))),
3416            token::T_NEQ => Ok(Box::new(|lhs, rhs| Ok(lhs.not_eq(rhs)))),
3417            token::T_GTR => Ok(Box::new(|lhs, rhs| Ok(lhs.gt(rhs)))),
3418            token::T_LSS => Ok(Box::new(|lhs, rhs| Ok(lhs.lt(rhs)))),
3419            token::T_GTE => Ok(Box::new(|lhs, rhs| Ok(lhs.gt_eq(rhs)))),
3420            token::T_LTE => Ok(Box::new(|lhs, rhs| Ok(lhs.lt_eq(rhs)))),
3421            token::T_POW => Ok(Box::new(move |lhs, rhs| {
3422                Ok(DfExpr::ScalarFunction(ScalarFunction {
3423                    func: datafusion_functions::math::power(),
3424                    args: vec![cast_float(lhs), cast_float(rhs)],
3425                }))
3426            })),
3427            token::T_ATAN2 => Ok(Box::new(move |lhs, rhs| {
3428                Ok(DfExpr::ScalarFunction(ScalarFunction {
3429                    func: datafusion_functions::math::atan2(),
3430                    args: vec![cast_float(lhs), cast_float(rhs)],
3431                }))
3432            })),
3433            _ => UnexpectedTokenSnafu { token }.fail(),
3434        }
3435    }
3436
3437    /// Check if the given op is a [comparison operator](https://prometheus.io/docs/prometheus/latest/querying/operators/#comparison-binary-operators).
3438    fn is_token_a_comparison_op(token: TokenType) -> bool {
3439        matches!(
3440            token.id(),
3441            token::T_EQLC
3442                | token::T_NEQ
3443                | token::T_GTR
3444                | token::T_LSS
3445                | token::T_GTE
3446                | token::T_LTE
3447        )
3448    }
3449
3450    /// Check if the given op is a set operator (UNION, INTERSECT and EXCEPT in SQL).
3451    fn is_token_a_set_op(token: TokenType) -> bool {
3452        matches!(
3453            token.id(),
3454            token::T_LAND // INTERSECT
3455                | token::T_LOR // UNION
3456                | token::T_LUNLESS // EXCEPT
3457        )
3458    }
3459
3460    fn align_binary_field_columns<'a>(
3461        left_field_columns: &'a [String],
3462        right_field_columns: &'a [String],
3463    ) -> (Vec<String>, Vec<(&'a String, &'a String)>) {
3464        let field_pairs = left_field_columns
3465            .iter()
3466            .zip(right_field_columns.iter())
3467            .collect::<Vec<_>>();
3468        let output_field_columns = field_pairs
3469            .iter()
3470            .map(|(left_col_name, _)| (*left_col_name).clone())
3471            .collect();
3472        (output_field_columns, field_pairs)
3473    }
3474
3475    fn plan_has_tsid_column(plan: &LogicalPlan) -> bool {
3476        plan.schema()
3477            .fields()
3478            .iter()
3479            .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
3480    }
3481
3482    fn optional_tsid_projection(
3483        schema: &DFSchemaRef,
3484        table_ref: Option<&TableReference>,
3485        keep_tsid: bool,
3486    ) -> Option<DfExpr> {
3487        keep_tsid.then_some(()).and_then(|_| {
3488            schema
3489                .qualified_field_with_name(table_ref, DATA_SCHEMA_TSID_COLUMN_NAME)
3490                .ok()
3491                .map(|field| DfExpr::Column(field.into()))
3492        })
3493    }
3494
3495    fn binary_join_key_columns(
3496        &self,
3497        left: &LogicalPlan,
3498        right: &LogicalPlan,
3499        only_join_time_index: bool,
3500        modifier: &Option<BinModifier>,
3501    ) -> (BTreeSet<String>, BTreeSet<String>) {
3502        let use_tsid_join = !only_join_time_index
3503            && modifier.as_ref().is_none_or(|modifier| {
3504                modifier.matching.is_none()
3505                    && matches!(modifier.card, VectorMatchCardinality::OneToOne)
3506            })
3507            && Self::plan_has_tsid_column(left)
3508            && Self::plan_has_tsid_column(right);
3509
3510        let (mut left_tag_columns, mut right_tag_columns) = if use_tsid_join {
3511            (
3512                BTreeSet::from([DATA_SCHEMA_TSID_COLUMN_NAME.to_string()]),
3513                BTreeSet::from([DATA_SCHEMA_TSID_COLUMN_NAME.to_string()]),
3514            )
3515        } else {
3516            let left_tag_columns = if only_join_time_index {
3517                BTreeSet::new()
3518            } else {
3519                self.ctx
3520                    .tag_columns
3521                    .iter()
3522                    .cloned()
3523                    .collect::<BTreeSet<_>>()
3524            };
3525            let right_tag_columns = left_tag_columns.clone();
3526            (left_tag_columns, right_tag_columns)
3527        };
3528
3529        if !use_tsid_join
3530            && let Some(modifier) = modifier
3531            && let Some(matching) = &modifier.matching
3532        {
3533            match matching {
3534                LabelModifier::Include(on) => {
3535                    let mask = on.labels.iter().cloned().collect::<BTreeSet<_>>();
3536                    left_tag_columns = left_tag_columns.intersection(&mask).cloned().collect();
3537                    right_tag_columns = right_tag_columns.intersection(&mask).cloned().collect();
3538                }
3539                LabelModifier::Exclude(ignoring) => {
3540                    for label in &ignoring.labels {
3541                        let _ = left_tag_columns.remove(label);
3542                        let _ = right_tag_columns.remove(label);
3543                    }
3544                }
3545            }
3546        }
3547
3548        (left_tag_columns, right_tag_columns)
3549    }
3550
3551    /// Build a inner join on time index column and tag columns to concat two logical plans.
3552    /// When `only_join_time_index == true` we only join on the time index, because these two plan may not have the same tag columns
3553    #[allow(clippy::too_many_arguments)]
3554    fn join_on_non_field_columns(
3555        &self,
3556        left: LogicalPlan,
3557        right: LogicalPlan,
3558        left_table_ref: TableReference,
3559        right_table_ref: TableReference,
3560        left_time_index_column: Option<String>,
3561        right_time_index_column: Option<String>,
3562        only_join_time_index: bool,
3563        modifier: &Option<BinModifier>,
3564    ) -> Result<LogicalPlan> {
3565        let (mut left_tag_columns, mut right_tag_columns) =
3566            self.binary_join_key_columns(&left, &right, only_join_time_index, modifier);
3567
3568        // push time index column if it exists
3569        if let (Some(left_time_index_column), Some(right_time_index_column)) =
3570            (left_time_index_column, right_time_index_column)
3571        {
3572            left_tag_columns.insert(left_time_index_column);
3573            right_tag_columns.insert(right_time_index_column);
3574        }
3575
3576        let right = LogicalPlanBuilder::from(right)
3577            .alias(right_table_ref)
3578            .context(DataFusionPlanningSnafu)?
3579            .build()
3580            .context(DataFusionPlanningSnafu)?;
3581
3582        // Inner Join on time index column to concat two operator
3583        LogicalPlanBuilder::from(left)
3584            .alias(left_table_ref)
3585            .context(DataFusionPlanningSnafu)?
3586            .join_detailed(
3587                right,
3588                JoinType::Inner,
3589                (
3590                    left_tag_columns
3591                        .into_iter()
3592                        .map(Column::from_name)
3593                        .collect::<Vec<_>>(),
3594                    right_tag_columns
3595                        .into_iter()
3596                        .map(Column::from_name)
3597                        .collect::<Vec<_>>(),
3598                ),
3599                None,
3600                NullEquality::NullEqualsNull,
3601            )
3602            .context(DataFusionPlanningSnafu)?
3603            .build()
3604            .context(DataFusionPlanningSnafu)
3605    }
3606
3607    /// Build a set operator (AND/OR/UNLESS)
3608    fn set_op_on_non_field_columns(
3609        &mut self,
3610        left: LogicalPlan,
3611        mut right: LogicalPlan,
3612        left_context: PromPlannerContext,
3613        right_context: PromPlannerContext,
3614        op: TokenType,
3615        modifier: &Option<BinModifier>,
3616    ) -> Result<LogicalPlan> {
3617        let mut left_tag_col_set = left_context
3618            .tag_columns
3619            .iter()
3620            .cloned()
3621            .collect::<HashSet<_>>();
3622        let mut right_tag_col_set = right_context
3623            .tag_columns
3624            .iter()
3625            .cloned()
3626            .collect::<HashSet<_>>();
3627
3628        if matches!(op.id(), token::T_LOR) {
3629            return self.or_operator(
3630                left,
3631                right,
3632                left_tag_col_set,
3633                right_tag_col_set,
3634                left_context,
3635                right_context,
3636                modifier,
3637            );
3638        }
3639
3640        // apply modifier
3641        if let Some(modifier) = modifier {
3642            // one-to-many and many-to-one are not supported
3643            ensure!(
3644                matches!(
3645                    modifier.card,
3646                    VectorMatchCardinality::OneToOne | VectorMatchCardinality::ManyToMany
3647                ),
3648                UnsupportedVectorMatchSnafu {
3649                    name: modifier.card.clone(),
3650                },
3651            );
3652            // apply label modifier
3653            if let Some(matching) = &modifier.matching {
3654                match matching {
3655                    // keeps columns mentioned in `on`
3656                    LabelModifier::Include(on) => {
3657                        let mask = on.labels.iter().cloned().collect::<HashSet<_>>();
3658                        left_tag_col_set = left_tag_col_set.intersection(&mask).cloned().collect();
3659                        right_tag_col_set =
3660                            right_tag_col_set.intersection(&mask).cloned().collect();
3661                    }
                    // removes columns mentioned in `ignoring`
3663                    LabelModifier::Exclude(ignoring) => {
3664                        // doesn't check existence of label
3665                        for label in &ignoring.labels {
3666                            let _ = left_tag_col_set.remove(label);
3667                            let _ = right_tag_col_set.remove(label);
3668                        }
3669                    }
3670                }
3671            }
3672        }
3673        // ensure two sides have the same tag columns
3674        if !matches!(op.id(), token::T_LOR) {
3675            ensure!(
3676                left_tag_col_set == right_tag_col_set,
3677                CombineTableColumnMismatchSnafu {
3678                    left: left_tag_col_set.into_iter().collect::<Vec<_>>(),
3679                    right: right_tag_col_set.into_iter().collect::<Vec<_>>(),
3680                }
3681            )
3682        };
3683        let left_time_index = left_context.time_index_column.clone().unwrap();
3684        let right_time_index = right_context.time_index_column.clone().unwrap();
3685        let join_keys = left_tag_col_set
3686            .iter()
3687            .cloned()
3688            .chain([left_time_index.clone()])
3689            .collect::<Vec<_>>();
3690        self.ctx.time_index_column = Some(left_time_index.clone());
3691        self.ctx.use_tsid = left_context.use_tsid;
3692
3693        // alias right time index column if necessary
3694        if left_context.time_index_column != right_context.time_index_column {
3695            let right_project_exprs = right
3696                .schema()
3697                .fields()
3698                .iter()
3699                .map(|field| {
3700                    if field.name() == &right_time_index {
3701                        DfExpr::Column(Column::from_name(&right_time_index)).alias(&left_time_index)
3702                    } else {
3703                        DfExpr::Column(Column::from_name(field.name()))
3704                    }
3705                })
3706                .collect::<Vec<_>>();
3707
3708            right = LogicalPlanBuilder::from(right)
3709                .project(right_project_exprs)
3710                .context(DataFusionPlanningSnafu)?
3711                .build()
3712                .context(DataFusionPlanningSnafu)?;
3713        }
3714
3715        ensure!(
3716            left_context.field_columns.len() == 1,
3717            MultiFieldsNotSupportedSnafu {
3718                operator: "AND operator"
3719            }
3720        );
3721        // Update the field column in context.
3722        // The AND/UNLESS operator only keep the field column in left input.
3723        let left_field_col = left_context.field_columns.first().unwrap();
3724        self.ctx.field_columns = vec![left_field_col.clone()];
3725
3726        // Generate join plan.
3727        // All set operations in PromQL are "distinct"
3728        match op.id() {
3729            token::T_LAND => LogicalPlanBuilder::from(left)
3730                .distinct()
3731                .context(DataFusionPlanningSnafu)?
3732                .join_detailed(
3733                    right,
3734                    JoinType::LeftSemi,
3735                    (join_keys.clone(), join_keys),
3736                    None,
3737                    NullEquality::NullEqualsNull,
3738                )
3739                .context(DataFusionPlanningSnafu)?
3740                .build()
3741                .context(DataFusionPlanningSnafu),
3742            token::T_LUNLESS => LogicalPlanBuilder::from(left)
3743                .distinct()
3744                .context(DataFusionPlanningSnafu)?
3745                .join_detailed(
3746                    right,
3747                    JoinType::LeftAnti,
3748                    (join_keys.clone(), join_keys),
3749                    None,
3750                    NullEquality::NullEqualsNull,
3751                )
3752                .context(DataFusionPlanningSnafu)?
3753                .build()
3754                .context(DataFusionPlanningSnafu),
3755            token::T_LOR => {
3756                // OR is handled at the beginning of this function, as it cannot
3757                // be expressed using JOIN like AND and UNLESS.
3758                unreachable!()
3759            }
3760            _ => UnexpectedTokenSnafu { token: op }.fail(),
3761        }
3762    }
3763
3764    // TODO(ruihang): change function name
3765    #[allow(clippy::too_many_arguments)]
3766    fn or_operator(
3767        &mut self,
3768        left: LogicalPlan,
3769        right: LogicalPlan,
3770        left_tag_cols_set: HashSet<String>,
3771        right_tag_cols_set: HashSet<String>,
3772        left_context: PromPlannerContext,
3773        right_context: PromPlannerContext,
3774        modifier: &Option<BinModifier>,
3775    ) -> Result<LogicalPlan> {
3776        // checks
3777        ensure!(
3778            left_context.field_columns.len() == right_context.field_columns.len(),
3779            CombineTableColumnMismatchSnafu {
3780                left: left_context.field_columns.clone(),
3781                right: right_context.field_columns.clone()
3782            }
3783        );
3784        ensure!(
3785            left_context.field_columns.len() == 1,
3786            MultiFieldsNotSupportedSnafu {
3787                operator: "OR operator"
3788            }
3789        );
3790
3791        // prepare hash sets
3792        let all_tags = left_tag_cols_set
3793            .union(&right_tag_cols_set)
3794            .cloned()
3795            .collect::<HashSet<_>>();
3796        let tags_not_in_left = all_tags
3797            .difference(&left_tag_cols_set)
3798            .cloned()
3799            .collect::<Vec<_>>();
3800        let tags_not_in_right = all_tags
3801            .difference(&right_tag_cols_set)
3802            .cloned()
3803            .collect::<Vec<_>>();
3804        let left_qualifier = left.schema().qualified_field(0).0.cloned();
3805        let right_qualifier = right.schema().qualified_field(0).0.cloned();
3806        let left_qualifier_string = left_qualifier
3807            .as_ref()
3808            .map(|l| l.to_string())
3809            .unwrap_or_default();
3810        let right_qualifier_string = right_qualifier
3811            .as_ref()
3812            .map(|r| r.to_string())
3813            .unwrap_or_default();
3814        let left_time_index_column =
3815            left_context
3816                .time_index_column
3817                .clone()
3818                .with_context(|| TimeIndexNotFoundSnafu {
3819                    table: left_qualifier_string.clone(),
3820                })?;
3821        let right_time_index_column =
3822            right_context
3823                .time_index_column
3824                .clone()
3825                .with_context(|| TimeIndexNotFoundSnafu {
3826                    table: right_qualifier_string.clone(),
3827                })?;
3828        // Take the name of first field column. The length is checked above.
3829        let left_field_col = left_context.field_columns.first().unwrap();
3830        let right_field_col = right_context.field_columns.first().unwrap();
3831        let left_has_tsid = left
3832            .schema()
3833            .fields()
3834            .iter()
3835            .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME);
3836        let right_has_tsid = right
3837            .schema()
3838            .fields()
3839            .iter()
3840            .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME);
3841
3842        // step 0: fill all columns in output schema
3843        let mut all_columns_set = left
3844            .schema()
3845            .fields()
3846            .iter()
3847            .chain(right.schema().fields().iter())
3848            .map(|field| field.name().clone())
3849            .collect::<HashSet<_>>();
3850        // Keep `__tsid` only when both sides contain it, otherwise it may break schema alignment
3851        // (e.g. `unknown_metric or some_metric`).
3852        if !(left_has_tsid && right_has_tsid) {
3853            all_columns_set.remove(DATA_SCHEMA_TSID_COLUMN_NAME);
3854        }
3855        // remove time index column
3856        all_columns_set.remove(&left_time_index_column);
3857        all_columns_set.remove(&right_time_index_column);
3858        // remove field column in the right
3859        if left_field_col != right_field_col {
3860            all_columns_set.remove(right_field_col);
3861        }
3862        let mut all_columns = all_columns_set.into_iter().collect::<Vec<_>>();
3863        // sort to ensure the generated schema is not volatile
3864        all_columns.sort_unstable();
3865        // use left time index column name as the result time index column name
3866        all_columns.insert(0, left_time_index_column.clone());
3867
3868        // step 1: align schema using project, fill non-exist columns with null
3869        let left_proj_exprs = all_columns.iter().map(|col| {
3870            if tags_not_in_left.contains(col) {
3871                DfExpr::Literal(ScalarValue::Utf8(None), None).alias(col.clone())
3872            } else {
3873                DfExpr::Column(Column::new(None::<String>, col))
3874            }
3875        });
3876        let right_time_index_expr = DfExpr::Column(Column::new(
3877            right_qualifier.clone(),
3878            right_time_index_column,
3879        ))
3880        .alias(left_time_index_column.clone());
3881        // The field column in right side may not have qualifier (it may be removed by join operation),
3882        // so we need to find it from the schema.
3883        let right_qualifier_for_field = right
3884            .schema()
3885            .iter()
3886            .find(|(_, f)| f.name() == right_field_col)
3887            .map(|(q, _)| q)
3888            .with_context(|| ColumnNotFoundSnafu {
3889                col: right_field_col.clone(),
3890            })?
3891            .cloned();
3892
3893        // `skip(1)` to skip the time index column
3894        let right_proj_exprs_without_time_index = all_columns.iter().skip(1).map(|col| {
3895            // expr
3896            if col == left_field_col && left_field_col != right_field_col {
3897                // qualify field in right side if necessary to handle different field name
3898                DfExpr::Column(Column::new(
3899                    right_qualifier_for_field.clone(),
3900                    right_field_col,
3901                ))
3902            } else if tags_not_in_right.contains(col) {
3903                DfExpr::Literal(ScalarValue::Utf8(None), None).alias(col.clone())
3904            } else {
3905                DfExpr::Column(Column::new(None::<String>, col))
3906            }
3907        });
3908        let right_proj_exprs = [right_time_index_expr]
3909            .into_iter()
3910            .chain(right_proj_exprs_without_time_index);
3911
3912        let left_projected = LogicalPlanBuilder::from(left)
3913            .project(left_proj_exprs)
3914            .context(DataFusionPlanningSnafu)?
3915            .alias(left_qualifier_string.clone())
3916            .context(DataFusionPlanningSnafu)?
3917            .build()
3918            .context(DataFusionPlanningSnafu)?;
3919        let right_projected = LogicalPlanBuilder::from(right)
3920            .project(right_proj_exprs)
3921            .context(DataFusionPlanningSnafu)?
3922            .alias(right_qualifier_string.clone())
3923            .context(DataFusionPlanningSnafu)?
3924            .build()
3925            .context(DataFusionPlanningSnafu)?;
3926
3927        // step 2: compute match columns
3928        let mut match_columns = if let Some(modifier) = modifier
3929            && let Some(matching) = &modifier.matching
3930        {
3931            match matching {
3932                // keeps columns mentioned in `on`
3933                LabelModifier::Include(on) => on.labels.clone(),
3934                // removes columns memtioned in `ignoring`
3935                LabelModifier::Exclude(ignoring) => {
3936                    let ignoring = ignoring.labels.iter().cloned().collect::<HashSet<_>>();
3937                    all_tags.difference(&ignoring).cloned().collect()
3938                }
3939            }
3940        } else {
3941            all_tags.iter().cloned().collect()
3942        };
3943        // sort to ensure the generated plan is not volatile
3944        match_columns.sort_unstable();
3945        // step 3: build `UnionDistinctOn` plan
3946        let schema = left_projected.schema().clone();
3947        let union_distinct_on = UnionDistinctOn::new(
3948            left_projected,
3949            right_projected,
3950            match_columns,
3951            left_time_index_column.clone(),
3952            schema,
3953        );
3954        let result = LogicalPlan::Extension(Extension {
3955            node: Arc::new(union_distinct_on),
3956        });
3957
3958        // step 4: update context
3959        self.ctx.time_index_column = Some(left_time_index_column);
3960        self.ctx.tag_columns = all_tags.into_iter().collect();
3961        self.ctx.field_columns = vec![left_field_col.clone()];
3962        self.ctx.use_tsid = left_has_tsid && right_has_tsid;
3963
3964        Ok(result)
3965    }
3966
3967    /// Build a projection that project and perform operation expr for every value columns.
3968    /// Non-value columns (tag and timestamp) will be preserved in the projection.
3969    ///
3970    /// # Side effect
3971    ///
3972    /// This function will update the value columns in the context. Those new column names
3973    /// don't contains qualifier.
3974    fn projection_for_each_field_column<F>(
3975        &mut self,
3976        input: LogicalPlan,
3977        name_to_expr: F,
3978    ) -> Result<LogicalPlan>
3979    where
3980        F: FnMut(&String) -> Result<DfExpr>,
3981    {
3982        let table_ref = self.ctx.table_name.clone().map(TableReference::bare);
3983        let non_field_columns_iter = self
3984            .ctx
3985            .tag_columns
3986            .iter()
3987            .chain(self.ctx.time_index_column.iter())
3988            .map(|col| Ok(DfExpr::Column(Column::new(table_ref.clone(), col))));
3989        let tsid_iter =
3990            Self::optional_tsid_projection(input.schema(), table_ref.as_ref(), self.ctx.use_tsid)
3991                .into_iter()
3992                .map(Ok);
3993
3994        // build computation exprs
3995        let result_field_columns = self
3996            .ctx
3997            .field_columns
3998            .iter()
3999            .map(name_to_expr)
4000            .collect::<Result<Vec<_>>>()?;
4001
4002        // alias the computation exprs to remove qualifier
4003        self.ctx.field_columns = result_field_columns
4004            .iter()
4005            .map(|expr| expr.schema_name().to_string())
4006            .collect();
4007        let field_columns_iter = result_field_columns
4008            .into_iter()
4009            .zip(self.ctx.field_columns.iter())
4010            .map(|(expr, name)| Ok(DfExpr::Alias(Alias::new(expr, None::<String>, name))));
4011
4012        // chain non-field columns (unchanged) and field columns (applied computation then alias)
4013        let project_fields = non_field_columns_iter
4014            .chain(tsid_iter)
4015            .chain(field_columns_iter)
4016            .collect::<Result<Vec<_>>>()?;
4017
4018        LogicalPlanBuilder::from(input)
4019            .project(project_fields)
4020            .context(DataFusionPlanningSnafu)?
4021            .build()
4022            .context(DataFusionPlanningSnafu)
4023    }
4024
4025    /// Build a filter plan that filter on value column. Notice that only one value column
4026    /// is expected.
4027    fn filter_on_field_column<F>(
4028        &self,
4029        input: LogicalPlan,
4030        mut name_to_expr: F,
4031    ) -> Result<LogicalPlan>
4032    where
4033        F: FnMut(&String) -> Result<DfExpr>,
4034    {
4035        ensure!(
4036            self.ctx.field_columns.len() == 1,
4037            UnsupportedExprSnafu {
4038                name: "filter on multi-value input"
4039            }
4040        );
4041
4042        let field_column_filter = name_to_expr(&self.ctx.field_columns[0])?;
4043
4044        LogicalPlanBuilder::from(input)
4045            .filter(field_column_filter)
4046            .context(DataFusionPlanningSnafu)?
4047            .build()
4048            .context(DataFusionPlanningSnafu)
4049    }
4050
4051    /// Generate an expr like `date_part("hour", <TIME_INDEX>)`. Caller should ensure the
4052    /// time index column in context is set
4053    fn date_part_on_time_index(&self, date_part: &str) -> Result<DfExpr> {
4054        let input_expr = datafusion::logical_expr::col(
4055            self.ctx
4056                .time_index_column
4057                .as_ref()
4058                // table name doesn't matters here
4059                .with_context(|| TimeIndexNotFoundSnafu {
4060                    table: "<doesn't matter>",
4061                })?,
4062        );
4063        let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
4064            func: datafusion_functions::datetime::date_part(),
4065            args: vec![date_part.lit(), input_expr],
4066        });
4067        Ok(fn_expr)
4068    }
4069
4070    fn strip_tsid_column(&self, plan: LogicalPlan) -> Result<LogicalPlan> {
4071        let schema = plan.schema();
4072        if !schema
4073            .fields()
4074            .iter()
4075            .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
4076        {
4077            return Ok(plan);
4078        }
4079
4080        // Preserve column qualifiers so downstream plan nodes can keep referencing
4081        // the columns by their original qualified names.
4082        let project_exprs = schema
4083            .iter()
4084            .filter(|(_, field)| field.name() != DATA_SCHEMA_TSID_COLUMN_NAME)
4085            .map(|(qualifier, field)| {
4086                DfExpr::Column(Column::new(qualifier.cloned(), field.name().clone()))
4087            })
4088            .collect::<Vec<_>>();
4089
4090        LogicalPlanBuilder::from(plan)
4091            .project(project_exprs)
4092            .context(DataFusionPlanningSnafu)?
4093            .build()
4094            .context(DataFusionPlanningSnafu)
4095    }
4096
4097    /// Apply an alias to the query result by adding a projection with the alias name
4098    fn apply_alias(&mut self, plan: LogicalPlan, alias_name: String) -> Result<LogicalPlan> {
4099        let fields_expr = self.create_field_column_exprs()?;
4100
4101        // TODO(dennis): how to support multi-value aliasing?
4102        ensure!(
4103            fields_expr.len() == 1,
4104            UnsupportedExprSnafu {
4105                name: "alias on multi-value result"
4106            }
4107        );
4108
4109        let project_fields = fields_expr
4110            .into_iter()
4111            .map(|expr| expr.alias(&alias_name))
4112            .chain(self.create_tag_column_exprs()?)
4113            .chain(Some(self.create_time_index_column_expr()?));
4114
4115        LogicalPlanBuilder::from(plan)
4116            .project(project_fields)
4117            .context(DataFusionPlanningSnafu)?
4118            .build()
4119            .context(DataFusionPlanningSnafu)
4120    }
4121}
4122
/// Arguments gathered for a function call while planning.
// NOTE(review): the field descriptions below are inferred from names and types only;
// confirm them against the code that fills this struct.
#[derive(Default, Debug)]
struct FunctionArgs {
    /// Optional PromQL expression argument; presumably the function's vector/matrix input.
    input: Option<PromExpr>,
    /// DataFusion expressions; presumably the remaining arguments converted to literals.
    literals: Vec<DfExpr>,
}
4128
/// The kinds of scalar functions supported in PromQL expressions.
/// Each variant describes how the function is processed and which arguments it expects.
#[derive(Debug, Clone)]
enum ScalarFunc {
    /// Scalar functions registered in DataFusion, including built-in ones
    /// (e.g., abs, sqrt, round, clamp).
    /// These are passed through directly to DataFusion's execution engine.
    /// Processing: simple argument insertion at the specified position.
    DataFusionBuiltin(Arc<ScalarUdfDef>),
    /// User-defined functions registered in DataFusion's function registry.
    /// Similar to `DataFusionBuiltin`, but for custom functions not built into DataFusion.
    /// Processing: direct pass-through with argument positioning.
    DataFusionUdf(Arc<ScalarUdfDef>),
    /// PromQL-specific functions that operate on time series data with temporal context.
    /// These functions require both timestamp ranges and values to perform calculations.
    /// Processing: automatically injects timestamp_range and value columns as first arguments.
    /// Examples: idelta, irate, resets, changes, deriv, and the `*_over_time` functions.
    Udf(Arc<ScalarUdfDef>),
    /// PromQL functions requiring extrapolation calculations with explicit range information.
    /// These functions need to know the time range length to perform rate calculations.
    /// The second field contains the range length in milliseconds.
    /// Processing: injects timestamp_range, value, time_index columns and appends range_length.
    /// Examples: increase, rate, delta.
    // TODO(ruihang): maybe merge with Udf later
    ExtrapolateUdf(Arc<ScalarUdfDef>, i64),
    /// Functions that generate expressions directly without external UDF calls.
    /// The expression is constructed during function matching and requires no additional
    /// processing.
    /// Examples: time(), minute(), hour(), month(), year() and other date/time extractors.
    GeneratedExpr,
}
4158
4159#[cfg(test)]
4160mod test {
4161    use std::time::{Duration, UNIX_EPOCH};
4162
4163    use catalog::RegisterTableRequest;
4164    use catalog::memory::{MemoryCatalogManager, new_memory_catalog_manager};
4165    use common_base::Plugins;
4166    use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
4167    use common_query::prelude::greptime_timestamp;
4168    use common_query::test_util::DummyDecoder;
4169    use datafusion::arrow::datatypes::Schema as ArrowSchema;
4170    use datafusion::datasource::memory::MemorySourceConfig;
4171    use datafusion::datasource::source::DataSourceExec;
4172    use datafusion::logical_expr::Extension;
4173    use datatypes::prelude::ConcreteDataType;
4174    use datatypes::schema::{ColumnSchema, Schema};
4175    use promql_parser::label::Labels;
4176    use promql_parser::parser;
4177    use session::context::QueryContext;
4178    use table::metadata::{TableInfoBuilder, TableMetaBuilder};
4179    use table::test_util::EmptyTable;
4180
4181    use super::*;
4182    use crate::QueryEngineContext;
4183    use crate::options::QueryOptions;
4184    use crate::parser::QueryLanguageParser;
4185
4186    fn find_instant_manipulate(plan: &LogicalPlan) -> Option<&InstantManipulate> {
4187        if let LogicalPlan::Extension(Extension { node }) = plan
4188            && let Some(instant_manipulate) = node.as_any().downcast_ref::<InstantManipulate>()
4189        {
4190            return Some(instant_manipulate);
4191        }
4192
4193        plan.inputs().into_iter().find_map(find_instant_manipulate)
4194    }
4195
4196    fn build_query_engine_state() -> QueryEngineState {
4197        QueryEngineState::new(
4198            new_memory_catalog_manager().unwrap(),
4199            None,
4200            None,
4201            None,
4202            None,
4203            None,
4204            false,
4205            Plugins::default(),
4206            QueryOptions::default(),
4207        )
4208    }
4209
4210    async fn build_optimized_promql_plan(
4211        table_provider: DfTableSourceProvider,
4212        eval_stmt: &EvalStmt,
4213    ) -> LogicalPlan {
4214        let state = build_query_engine_state();
4215        let raw_plan = PromPlanner::stmt_to_plan(table_provider, eval_stmt, &state)
4216            .await
4217            .unwrap();
4218        let context = QueryEngineContext::new(state.session_state(), QueryContext::arc());
4219        state
4220            .optimize_by_extension_rules(raw_plan, &context)
4221            .unwrap()
4222    }
4223
4224    async fn build_optimized_tsid_plan(
4225        query: &str,
4226        num_tag: usize,
4227        num_field: usize,
4228        end_secs: u64,
4229        lookback_secs: u64,
4230    ) -> String {
4231        let eval_stmt = EvalStmt {
4232            expr: parser::parse(query).unwrap(),
4233            start: UNIX_EPOCH,
4234            end: UNIX_EPOCH
4235                .checked_add(Duration::from_secs(end_secs))
4236                .unwrap(),
4237            interval: Duration::from_secs(5),
4238            lookback_delta: Duration::from_secs(lookback_secs),
4239        };
4240        let table_provider = build_test_table_provider_with_tsid(
4241            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
4242            num_tag,
4243            num_field,
4244        )
4245        .await;
4246
4247        build_optimized_promql_plan(table_provider, &eval_stmt)
4248            .await
4249            .display_indent_schema()
4250            .to_string()
4251    }
4252
4253    async fn assert_nested_count_rewrite_applies(query: &str, expected_outer_agg: &str) {
4254        let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await;
4255
4256        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
4257        assert!(plan_str.contains("Projection: some_metric.timestamp, some_metric.tag_0"));
4258        assert!(plan_str.contains("Distinct:"));
4259        assert!(plan_str.contains(expected_outer_agg), "{plan_str}");
4260        assert!(!plan_str.contains("PromSeriesDivide: tags=[\"tag_0\"]"));
4261    }
4262
4263    async fn assert_nested_count_rewrite_missing(query: &str, num_tag: usize, lookback_secs: u64) {
4264        let plan_str = build_optimized_tsid_plan(query, num_tag, 1, 100_000, lookback_secs).await;
4265        assert!(!plan_str.contains("Distinct:"), "{plan_str}");
4266    }
4267
4268    fn build_eval_stmt(expr: &str) -> EvalStmt {
4269        EvalStmt {
4270            expr: parser::parse(expr).unwrap(),
4271            start: UNIX_EPOCH,
4272            end: UNIX_EPOCH
4273                .checked_add(Duration::from_secs(100_000))
4274                .unwrap(),
4275            interval: Duration::from_secs(5),
4276            lookback_delta: Duration::from_secs(1),
4277        }
4278    }
4279
4280    async fn build_test_table_provider(
4281        table_name_tuples: &[(String, String)],
4282        num_tag: usize,
4283        num_field: usize,
4284    ) -> DfTableSourceProvider {
4285        let catalog_list = MemoryCatalogManager::with_default_setup();
4286        for (schema_name, table_name) in table_name_tuples {
4287            let mut columns = vec![];
4288            for i in 0..num_tag {
4289                columns.push(ColumnSchema::new(
4290                    format!("tag_{i}"),
4291                    ConcreteDataType::string_datatype(),
4292                    false,
4293                ));
4294            }
4295            columns.push(
4296                ColumnSchema::new(
4297                    "timestamp".to_string(),
4298                    ConcreteDataType::timestamp_millisecond_datatype(),
4299                    false,
4300                )
4301                .with_time_index(true),
4302            );
4303            for i in 0..num_field {
4304                columns.push(ColumnSchema::new(
4305                    format!("field_{i}"),
4306                    ConcreteDataType::float64_datatype(),
4307                    true,
4308                ));
4309            }
4310            let schema = Arc::new(Schema::new(columns));
4311            let table_meta = TableMetaBuilder::empty()
4312                .schema(schema)
4313                .primary_key_indices((0..num_tag).collect())
4314                .value_indices((num_tag + 1..num_tag + 1 + num_field).collect())
4315                .next_column_id(1024)
4316                .build()
4317                .unwrap();
4318            let table_info = TableInfoBuilder::default()
4319                .name(table_name.clone())
4320                .meta(table_meta)
4321                .build()
4322                .unwrap();
4323            let table = EmptyTable::from_table_info(&table_info);
4324
4325            assert!(
4326                catalog_list
4327                    .register_table_sync(RegisterTableRequest {
4328                        catalog: DEFAULT_CATALOG_NAME.to_string(),
4329                        schema: schema_name.clone(),
4330                        table_name: table_name.clone(),
4331                        table_id: 1024,
4332                        table,
4333                    })
4334                    .is_ok()
4335            );
4336        }
4337
4338        DfTableSourceProvider::new(
4339            catalog_list,
4340            false,
4341            QueryContext::arc(),
4342            DummyDecoder::arc(),
4343            false,
4344        )
4345    }
4346
4347    async fn build_test_table_provider_with_tsid(
4348        table_name_tuples: &[(String, String)],
4349        num_tag: usize,
4350        num_field: usize,
4351    ) -> DfTableSourceProvider {
4352        let table_specs = table_name_tuples
4353            .iter()
4354            .map(|(schema_name, table_name)| ((schema_name.clone(), table_name.clone()), num_field))
4355            .collect::<Vec<_>>();
4356        build_test_table_provider_with_tsid_fields(&table_specs, num_tag).await
4357    }
4358
4359    async fn build_test_table_provider_with_tsid_fields(
4360        table_specs: &[((String, String), usize)],
4361        num_tag: usize,
4362    ) -> DfTableSourceProvider {
4363        let catalog_list = MemoryCatalogManager::with_default_setup();
4364
4365        let physical_table_name = "phy";
4366        let physical_table_id = 999u32;
4367        let physical_num_field = table_specs
4368            .iter()
4369            .map(|(_, num_field)| *num_field)
4370            .max()
4371            .unwrap_or(0);
4372
4373        // Register a metric engine physical table with internal columns.
4374        {
4375            let mut columns = vec![
4376                ColumnSchema::new(
4377                    DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string(),
4378                    ConcreteDataType::uint32_datatype(),
4379                    false,
4380                ),
4381                ColumnSchema::new(
4382                    DATA_SCHEMA_TSID_COLUMN_NAME.to_string(),
4383                    ConcreteDataType::uint64_datatype(),
4384                    false,
4385                ),
4386            ];
4387            for i in 0..num_tag {
4388                columns.push(ColumnSchema::new(
4389                    format!("tag_{i}"),
4390                    ConcreteDataType::string_datatype(),
4391                    false,
4392                ));
4393            }
4394            columns.push(
4395                ColumnSchema::new(
4396                    "timestamp".to_string(),
4397                    ConcreteDataType::timestamp_millisecond_datatype(),
4398                    false,
4399                )
4400                .with_time_index(true),
4401            );
4402            for i in 0..physical_num_field {
4403                columns.push(ColumnSchema::new(
4404                    format!("field_{i}"),
4405                    ConcreteDataType::float64_datatype(),
4406                    true,
4407                ));
4408            }
4409
4410            let schema = Arc::new(Schema::new(columns));
4411            let primary_key_indices = (0..(2 + num_tag)).collect::<Vec<_>>();
4412            let table_meta = TableMetaBuilder::empty()
4413                .schema(schema)
4414                .primary_key_indices(primary_key_indices)
4415                .value_indices((2 + num_tag..2 + num_tag + 1 + physical_num_field).collect())
4416                .engine(METRIC_ENGINE_NAME.to_string())
4417                .next_column_id(1024)
4418                .build()
4419                .unwrap();
4420            let table_info = TableInfoBuilder::default()
4421                .table_id(physical_table_id)
4422                .name(physical_table_name)
4423                .meta(table_meta)
4424                .build()
4425                .unwrap();
4426            let table = EmptyTable::from_table_info(&table_info);
4427
4428            assert!(
4429                catalog_list
4430                    .register_table_sync(RegisterTableRequest {
4431                        catalog: DEFAULT_CATALOG_NAME.to_string(),
4432                        schema: DEFAULT_SCHEMA_NAME.to_string(),
4433                        table_name: physical_table_name.to_string(),
4434                        table_id: physical_table_id,
4435                        table,
4436                    })
4437                    .is_ok()
4438            );
4439        }
4440
4441        // Register metric engine logical tables without `__tsid`, referencing the physical table.
4442        for (idx, ((schema_name, table_name), num_field)) in table_specs.iter().enumerate() {
4443            let mut columns = vec![];
4444            for i in 0..num_tag {
4445                columns.push(ColumnSchema::new(
4446                    format!("tag_{i}"),
4447                    ConcreteDataType::string_datatype(),
4448                    false,
4449                ));
4450            }
4451            columns.push(
4452                ColumnSchema::new(
4453                    "timestamp".to_string(),
4454                    ConcreteDataType::timestamp_millisecond_datatype(),
4455                    false,
4456                )
4457                .with_time_index(true),
4458            );
4459            for i in 0..*num_field {
4460                columns.push(ColumnSchema::new(
4461                    format!("field_{i}"),
4462                    ConcreteDataType::float64_datatype(),
4463                    true,
4464                ));
4465            }
4466
4467            let schema = Arc::new(Schema::new(columns));
4468            let mut options = table::requests::TableOptions::default();
4469            options.extra_options.insert(
4470                LOGICAL_TABLE_METADATA_KEY.to_string(),
4471                physical_table_name.to_string(),
4472            );
4473            let table_id = 1024u32 + idx as u32;
4474            let table_meta = TableMetaBuilder::empty()
4475                .schema(schema)
4476                .primary_key_indices((0..num_tag).collect())
4477                .value_indices((num_tag + 1..num_tag + 1 + *num_field).collect())
4478                .engine(METRIC_ENGINE_NAME.to_string())
4479                .options(options)
4480                .next_column_id(1024)
4481                .build()
4482                .unwrap();
4483            let table_info = TableInfoBuilder::default()
4484                .table_id(table_id)
4485                .name(table_name.clone())
4486                .meta(table_meta)
4487                .build()
4488                .unwrap();
4489            let table = EmptyTable::from_table_info(&table_info);
4490
4491            assert!(
4492                catalog_list
4493                    .register_table_sync(RegisterTableRequest {
4494                        catalog: DEFAULT_CATALOG_NAME.to_string(),
4495                        schema: schema_name.clone(),
4496                        table_name: table_name.clone(),
4497                        table_id,
4498                        table,
4499                    })
4500                    .is_ok()
4501            );
4502        }
4503
4504        DfTableSourceProvider::new(
4505            catalog_list,
4506            false,
4507            QueryContext::arc(),
4508            DummyDecoder::arc(),
4509            false,
4510        )
4511    }
4512
4513    async fn build_test_table_provider_with_fields(
4514        table_name_tuples: &[(String, String)],
4515        tags: &[&str],
4516    ) -> DfTableSourceProvider {
4517        let catalog_list = MemoryCatalogManager::with_default_setup();
4518        for (schema_name, table_name) in table_name_tuples {
4519            let mut columns = vec![];
4520            let num_tag = tags.len();
4521            for tag in tags {
4522                columns.push(ColumnSchema::new(
4523                    tag.to_string(),
4524                    ConcreteDataType::string_datatype(),
4525                    false,
4526                ));
4527            }
4528            columns.push(
4529                ColumnSchema::new(
4530                    greptime_timestamp().to_string(),
4531                    ConcreteDataType::timestamp_millisecond_datatype(),
4532                    false,
4533                )
4534                .with_time_index(true),
4535            );
4536            columns.push(ColumnSchema::new(
4537                greptime_value().to_string(),
4538                ConcreteDataType::float64_datatype(),
4539                true,
4540            ));
4541            let schema = Arc::new(Schema::new(columns));
4542            let table_meta = TableMetaBuilder::empty()
4543                .schema(schema)
4544                .primary_key_indices((0..num_tag).collect())
4545                .next_column_id(1024)
4546                .build()
4547                .unwrap();
4548            let table_info = TableInfoBuilder::default()
4549                .name(table_name.clone())
4550                .meta(table_meta)
4551                .build()
4552                .unwrap();
4553            let table = EmptyTable::from_table_info(&table_info);
4554
4555            assert!(
4556                catalog_list
4557                    .register_table_sync(RegisterTableRequest {
4558                        catalog: DEFAULT_CATALOG_NAME.to_string(),
4559                        schema: schema_name.clone(),
4560                        table_name: table_name.clone(),
4561                        table_id: 1024,
4562                        table,
4563                    })
4564                    .is_ok()
4565            );
4566        }
4567
4568        DfTableSourceProvider::new(
4569            catalog_list,
4570            false,
4571            QueryContext::arc(),
4572            DummyDecoder::arc(),
4573            false,
4574        )
4575    }
4576
4577    // {
4578    //     input: `abs(some_metric{foo!="bar"})`,
4579    //     expected: &Call{
4580    //         Func: MustGetFunction("abs"),
4581    //         Args: Expressions{
4582    //             &VectorSelector{
4583    //                 Name: "some_metric",
4584    //                 LabelMatchers: []*labels.Matcher{
4585    //                     MustLabelMatcher(labels.MatchNotEqual, "foo", "bar"),
4586    //                     MustLabelMatcher(labels.MatchEqual, model.MetricNameLabel, "some_metric"),
4587    //                 },
4588    //             },
4589    //         },
4590    //     },
4591    // },
4592    async fn do_single_instant_function_call(fn_name: &'static str, plan_name: &str) {
4593        let prom_expr =
4594            parser::parse(&format!("{fn_name}(some_metric{{tag_0!=\"bar\"}})")).unwrap();
4595        let eval_stmt = EvalStmt {
4596            expr: prom_expr,
4597            start: UNIX_EPOCH,
4598            end: UNIX_EPOCH
4599                .checked_add(Duration::from_secs(100_000))
4600                .unwrap(),
4601            interval: Duration::from_secs(5),
4602            lookback_delta: Duration::from_secs(1),
4603        };
4604
4605        let table_provider = build_test_table_provider(
4606            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
4607            1,
4608            1,
4609        )
4610        .await;
4611        let plan =
4612            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
4613                .await
4614                .unwrap();
4615
4616        let expected = String::from(
4617            "Filter: TEMPLATE(field_0) IS NOT NULL [timestamp:Timestamp(ms), TEMPLATE(field_0):Float64;N, tag_0:Utf8]\
4618            \n  Projection: some_metric.timestamp, TEMPLATE(some_metric.field_0) AS TEMPLATE(field_0), some_metric.tag_0 [timestamp:Timestamp(ms), TEMPLATE(field_0):Float64;N, tag_0:Utf8]\
4619            \n    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
4620            \n      PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
4621            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
4622	            \n          Filter: some_metric.tag_0 != Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
4623            \n            TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]"
4624        ).replace("TEMPLATE", plan_name);
4625
4626        assert_eq!(plan.display_indent_schema().to_string(), expected);
4627    }
4628
4629    #[tokio::test]
4630    async fn single_abs() {
4631        do_single_instant_function_call("abs", "abs").await;
4632    }
4633
4634    #[tokio::test]
4635    #[should_panic]
4636    async fn single_absent() {
4637        do_single_instant_function_call("absent", "").await;
4638    }
4639
4640    #[tokio::test]
4641    async fn single_ceil() {
4642        do_single_instant_function_call("ceil", "ceil").await;
4643    }
4644
4645    #[tokio::test]
4646    async fn single_exp() {
4647        do_single_instant_function_call("exp", "exp").await;
4648    }
4649
4650    #[tokio::test]
4651    async fn single_ln() {
4652        do_single_instant_function_call("ln", "ln").await;
4653    }
4654
4655    #[tokio::test]
4656    async fn single_log2() {
4657        do_single_instant_function_call("log2", "log2").await;
4658    }
4659
4660    #[tokio::test]
4661    async fn single_log10() {
4662        do_single_instant_function_call("log10", "log10").await;
4663    }
4664
4665    #[tokio::test]
4666    #[should_panic]
4667    async fn single_scalar() {
4668        do_single_instant_function_call("scalar", "").await;
4669    }
4670
4671    #[tokio::test]
4672    #[should_panic]
4673    async fn single_sgn() {
4674        do_single_instant_function_call("sgn", "").await;
4675    }
4676
4677    #[tokio::test]
4678    #[should_panic]
4679    async fn single_sort() {
4680        do_single_instant_function_call("sort", "").await;
4681    }
4682
4683    #[tokio::test]
4684    #[should_panic]
4685    async fn single_sort_desc() {
4686        do_single_instant_function_call("sort_desc", "").await;
4687    }
4688
4689    #[tokio::test]
4690    async fn single_sqrt() {
4691        do_single_instant_function_call("sqrt", "sqrt").await;
4692    }
4693
4694    #[tokio::test]
4695    #[should_panic]
4696    async fn single_timestamp() {
4697        do_single_instant_function_call("timestamp", "").await;
4698    }
4699
4700    #[tokio::test]
4701    async fn single_acos() {
4702        do_single_instant_function_call("acos", "acos").await;
4703    }
4704
4705    #[tokio::test]
4706    #[should_panic]
4707    async fn single_acosh() {
4708        do_single_instant_function_call("acosh", "").await;
4709    }
4710
4711    #[tokio::test]
4712    async fn single_asin() {
4713        do_single_instant_function_call("asin", "asin").await;
4714    }
4715
4716    #[tokio::test]
4717    #[should_panic]
4718    async fn single_asinh() {
4719        do_single_instant_function_call("asinh", "").await;
4720    }
4721
4722    #[tokio::test]
4723    async fn single_atan() {
4724        do_single_instant_function_call("atan", "atan").await;
4725    }
4726
4727    #[tokio::test]
4728    #[should_panic]
4729    async fn single_atanh() {
4730        do_single_instant_function_call("atanh", "").await;
4731    }
4732
4733    #[tokio::test]
4734    async fn single_cos() {
4735        do_single_instant_function_call("cos", "cos").await;
4736    }
4737
4738    #[tokio::test]
4739    #[should_panic]
4740    async fn single_cosh() {
4741        do_single_instant_function_call("cosh", "").await;
4742    }
4743
4744    #[tokio::test]
4745    async fn single_sin() {
4746        do_single_instant_function_call("sin", "sin").await;
4747    }
4748
4749    #[tokio::test]
4750    #[should_panic]
4751    async fn single_sinh() {
4752        do_single_instant_function_call("sinh", "").await;
4753    }
4754
4755    #[tokio::test]
4756    async fn single_tan() {
4757        do_single_instant_function_call("tan", "tan").await;
4758    }
4759
4760    #[tokio::test]
4761    #[should_panic]
4762    async fn single_tanh() {
4763        do_single_instant_function_call("tanh", "").await;
4764    }
4765
4766    #[tokio::test]
4767    #[should_panic]
4768    async fn single_deg() {
4769        do_single_instant_function_call("deg", "").await;
4770    }
4771
4772    #[tokio::test]
4773    #[should_panic]
4774    async fn single_rad() {
4775        do_single_instant_function_call("rad", "").await;
4776    }
4777
4778    // {
4779    //     input: "avg by (foo)(some_metric)",
4780    //     expected: &AggregateExpr{
4781    //         Op: AVG,
4782    //         Expr: &VectorSelector{
4783    //             Name: "some_metric",
4784    //             LabelMatchers: []*labels.Matcher{
4785    //                 MustLabelMatcher(labels.MatchEqual, model.MetricNameLabel, "some_metric"),
4786    //             },
4787    //             PosRange: PositionRange{
4788    //                 Start: 13,
4789    //                 End:   24,
4790    //             },
4791    //         },
4792    //         Grouping: []string{"foo"},
4793    //         PosRange: PositionRange{
4794    //             Start: 0,
4795    //             End:   25,
4796    //         },
4797    //     },
4798    // },
4799    async fn do_aggregate_expr_plan(fn_name: &str, plan_name: &str) {
4800        let prom_expr = parser::parse(&format!(
4801            "{fn_name} by (tag_1)(some_metric{{tag_0!=\"bar\"}})",
4802        ))
4803        .unwrap();
4804        let mut eval_stmt = EvalStmt {
4805            expr: prom_expr,
4806            start: UNIX_EPOCH,
4807            end: UNIX_EPOCH
4808                .checked_add(Duration::from_secs(100_000))
4809                .unwrap(),
4810            interval: Duration::from_secs(5),
4811            lookback_delta: Duration::from_secs(1),
4812        };
4813
4814        // test group by
4815        let table_provider = build_test_table_provider(
4816            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
4817            2,
4818            2,
4819        )
4820        .await;
4821        let plan =
4822            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
4823                .await
4824                .unwrap();
4825        let expected_no_without = String::from(
4826            "Sort: some_metric.tag_1 ASC NULLS LAST, some_metric.timestamp ASC NULLS LAST [tag_1:Utf8, timestamp:Timestamp(ms), TEMPLATE(some_metric.field_0):Float64;N, TEMPLATE(some_metric.field_1):Float64;N]\
4827            \n  Aggregate: groupBy=[[some_metric.tag_1, some_metric.timestamp]], aggr=[[TEMPLATE(some_metric.field_0), TEMPLATE(some_metric.field_1)]] [tag_1:Utf8, timestamp:Timestamp(ms), TEMPLATE(some_metric.field_0):Float64;N, TEMPLATE(some_metric.field_1):Float64;N]\
4828            \n    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
4829            \n      PromSeriesDivide: tags=[\"tag_0\", \"tag_1\"] [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
4830            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.tag_1 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
4831            \n          Filter: some_metric.tag_0 != Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
4832            \n            TableScan: some_metric [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]"
4833        ).replace("TEMPLATE", plan_name);
4834        assert_eq!(
4835            plan.display_indent_schema().to_string(),
4836            expected_no_without
4837        );
4838
4839        // test group without
4840        if let PromExpr::Aggregate(AggregateExpr { modifier, .. }) = &mut eval_stmt.expr {
4841            *modifier = Some(LabelModifier::Exclude(Labels {
4842                labels: vec![String::from("tag_1")].into_iter().collect(),
4843            }));
4844        }
4845        let table_provider = build_test_table_provider(
4846            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
4847            2,
4848            2,
4849        )
4850        .await;
4851        let plan =
4852            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
4853                .await
4854                .unwrap();
4855        let expected_without = String::from(
4856            "Sort: some_metric.tag_0 ASC NULLS LAST, some_metric.timestamp ASC NULLS LAST [tag_0:Utf8, timestamp:Timestamp(ms), TEMPLATE(some_metric.field_0):Float64;N, TEMPLATE(some_metric.field_1):Float64;N]\
4857            \n  Aggregate: groupBy=[[some_metric.tag_0, some_metric.timestamp]], aggr=[[TEMPLATE(some_metric.field_0), TEMPLATE(some_metric.field_1)]] [tag_0:Utf8, timestamp:Timestamp(ms), TEMPLATE(some_metric.field_0):Float64;N, TEMPLATE(some_metric.field_1):Float64;N]\
4858            \n    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
4859            \n      PromSeriesDivide: tags=[\"tag_0\", \"tag_1\"] [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
4860            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.tag_1 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
4861            \n          Filter: some_metric.tag_0 != Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
4862            \n            TableScan: some_metric [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]"
4863        ).replace("TEMPLATE", plan_name);
4864        assert_eq!(plan.display_indent_schema().to_string(), expected_without);
4865    }
4866
4867    #[tokio::test]
4868    async fn aggregate_sum() {
4869        do_aggregate_expr_plan("sum", "sum").await;
4870    }
4871
4872    #[tokio::test]
4873    async fn tsid_is_used_for_series_divide_when_available() {
4874        let prom_expr = parser::parse("some_metric").unwrap();
4875        let eval_stmt = EvalStmt {
4876            expr: prom_expr,
4877            start: UNIX_EPOCH,
4878            end: UNIX_EPOCH
4879                .checked_add(Duration::from_secs(100_000))
4880                .unwrap(),
4881            interval: Duration::from_secs(5),
4882            lookback_delta: Duration::from_secs(1),
4883        };
4884
4885        let table_provider = build_test_table_provider_with_tsid(
4886            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
4887            1,
4888            1,
4889        )
4890        .await;
4891        let plan =
4892            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
4893                .await
4894                .unwrap();
4895
4896        let plan_str = plan.display_indent_schema().to_string();
4897        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
4898        assert!(plan_str.contains("__tsid ASC NULLS FIRST"));
4899        assert!(
4900            !plan
4901                .schema()
4902                .fields()
4903                .iter()
4904                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
4905        );
4906
4907        let manipulate = find_instant_manipulate(&plan).unwrap();
4908        let exec = manipulate.to_execution_plan(Arc::new(DataSourceExec::new(Arc::new(
4909            MemorySourceConfig::try_new(&[], Arc::new(ArrowSchema::empty()), None).unwrap(),
4910        ))));
4911        assert!(format!("{exec:?}").contains("reuse_tsid_column: true"));
4912    }
4913
4914    #[tokio::test]
4915    async fn default_binary_join_uses_tsid_when_available() {
4916        let eval_stmt = build_eval_stmt("some_metric / some_alt_metric");
4917
4918        let table_provider = build_test_table_provider_with_tsid(
4919            &[
4920                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
4921                (
4922                    DEFAULT_SCHEMA_NAME.to_string(),
4923                    "some_alt_metric".to_string(),
4924                ),
4925            ],
4926            1,
4927            1,
4928        )
4929        .await;
4930        let plan =
4931            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
4932                .await
4933                .unwrap();
4934
4935        let plan_str = plan.display_indent_schema().to_string();
4936        assert!(
4937            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
4938            "{plan_str}"
4939        );
4940        assert!(
4941            !plan_str.contains("some_metric.tag_0 = some_alt_metric.tag_0"),
4942            "{plan_str}"
4943        );
4944    }
4945
4946    #[tokio::test]
4947    async fn tsid_is_preserved_for_nested_default_binary_joins() {
4948        let eval_stmt = build_eval_stmt("(some_metric - some_alt_metric) / some_third_metric");
4949
4950        let table_provider = build_test_table_provider_with_tsid(
4951            &[
4952                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
4953                (
4954                    DEFAULT_SCHEMA_NAME.to_string(),
4955                    "some_alt_metric".to_string(),
4956                ),
4957                (
4958                    DEFAULT_SCHEMA_NAME.to_string(),
4959                    "some_third_metric".to_string(),
4960                ),
4961            ],
4962            1,
4963            1,
4964        )
4965        .await;
4966        let plan =
4967            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
4968                .await
4969                .unwrap();
4970
4971        let plan_str = plan.display_indent_schema().to_string();
4972        assert_eq!(plan_str.matches("__tsid =").count(), 2, "{plan_str}");
4973        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
4974    }
4975
4976    #[tokio::test]
4977    async fn repeated_tsid_binary_operand_keeps_tsid_join_keys() {
4978        let eval_stmt = build_eval_stmt("((some_metric - some_alt_metric) / some_metric) * 100");
4979
4980        let table_provider = build_test_table_provider_with_tsid(
4981            &[
4982                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
4983                (
4984                    DEFAULT_SCHEMA_NAME.to_string(),
4985                    "some_alt_metric".to_string(),
4986                ),
4987            ],
4988            1,
4989            1,
4990        )
4991        .await;
4992        let plan =
4993            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
4994                .await
4995                .unwrap();
4996
4997        let plan_str = plan.display_indent_schema().to_string();
4998        assert_eq!(plan_str.matches("__tsid =").count(), 2, "{plan_str}");
4999        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
5000    }
5001
5002    #[tokio::test]
5003    async fn repeated_tsid_binary_operand_keeps_shorter_field_side() {
5004        let eval_stmt =
5005            build_eval_stmt("((two_field_metric - one_field_metric) / one_field_metric) * 100");
5006
5007        let table_provider = build_test_table_provider_with_tsid_fields(
5008            &[
5009                (
5010                    (
5011                        DEFAULT_SCHEMA_NAME.to_string(),
5012                        "two_field_metric".to_string(),
5013                    ),
5014                    2,
5015                ),
5016                (
5017                    (
5018                        DEFAULT_SCHEMA_NAME.to_string(),
5019                        "one_field_metric".to_string(),
5020                    ),
5021                    1,
5022                ),
5023            ],
5024            1,
5025        )
5026        .await;
5027        let plan =
5028            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5029                .await
5030                .unwrap();
5031
5032        let field_names = plan
5033            .schema()
5034            .fields()
5035            .iter()
5036            .map(|field| field.name().clone())
5037            .collect::<Vec<_>>();
5038        let value_columns = field_names
5039            .iter()
5040            .filter(|name| {
5041                *name != "tag_0" && *name != "timestamp" && *name != DATA_SCHEMA_TSID_COLUMN_NAME
5042            })
5043            .count();
5044        assert_eq!(value_columns, 1, "{field_names:?}");
5045        let plan_str = plan.display_indent_schema().to_string();
5046        assert_eq!(plan_str.matches("__tsid =").count(), 2, "{plan_str}");
5047        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
5048    }
5049
5050    #[tokio::test]
5051    async fn tsid_binary_join_uses_shorter_field_side() {
5052        let eval_stmt = build_eval_stmt("one_field_metric / two_field_metric");
5053
5054        let table_provider = build_test_table_provider_with_tsid_fields(
5055            &[
5056                (
5057                    (
5058                        DEFAULT_SCHEMA_NAME.to_string(),
5059                        "one_field_metric".to_string(),
5060                    ),
5061                    1,
5062                ),
5063                (
5064                    (
5065                        DEFAULT_SCHEMA_NAME.to_string(),
5066                        "two_field_metric".to_string(),
5067                    ),
5068                    2,
5069                ),
5070            ],
5071            1,
5072        )
5073        .await;
5074        let plan =
5075            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5076                .await
5077                .unwrap();
5078
5079        let field_names = plan
5080            .schema()
5081            .fields()
5082            .iter()
5083            .map(|field| field.name().clone())
5084            .collect::<Vec<_>>();
5085        let value_columns = field_names
5086            .iter()
5087            .filter(|name| {
5088                *name != "tag_0" && *name != "timestamp" && *name != DATA_SCHEMA_TSID_COLUMN_NAME
5089            })
5090            .count();
5091        assert_eq!(value_columns, 1, "{field_names:?}");
5092    }
5093
5094    #[tokio::test]
5095    async fn label_matching_modifier_disables_tsid_binary_join() {
5096        let eval_stmt = build_eval_stmt("some_metric / ignoring(tag_0) some_alt_metric");
5097
5098        let table_provider = build_test_table_provider_with_tsid(
5099            &[
5100                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5101                (
5102                    DEFAULT_SCHEMA_NAME.to_string(),
5103                    "some_alt_metric".to_string(),
5104                ),
5105            ],
5106            2,
5107            1,
5108        )
5109        .await;
5110        let plan =
5111            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5112                .await
5113                .unwrap();
5114
5115        let plan_str = plan.display_indent_schema().to_string();
5116        assert!(!plan_str.contains("__tsid ="), "{plan_str}");
5117        assert!(
5118            plan_str.contains("some_metric.tag_1 = some_alt_metric.tag_1"),
5119            "{plan_str}"
5120        );
5121    }
5122
5123    #[tokio::test]
5124    async fn comparison_binary_join_uses_tsid_and_keeps_it_in_filtered_result() {
5125        let eval_stmt = build_eval_stmt("some_metric > some_alt_metric");
5126
5127        let table_provider = build_test_table_provider_with_tsid(
5128            &[
5129                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5130                (
5131                    DEFAULT_SCHEMA_NAME.to_string(),
5132                    "some_alt_metric".to_string(),
5133                ),
5134            ],
5135            2,
5136            1,
5137        )
5138        .await;
5139        let mut planner = PromPlanner {
5140            table_provider,
5141            ctx: PromPlannerContext::from_eval_stmt(&eval_stmt),
5142        };
5143        let plan = planner
5144            .prom_expr_to_plan(&eval_stmt.expr, &build_query_engine_state())
5145            .await
5146            .unwrap();
5147
5148        let plan_str = plan.display_indent_schema().to_string();
5149        assert!(
5150            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
5151            "{plan_str}"
5152        );
5153        assert!(
5154            plan.schema()
5155                .fields()
5156                .iter()
5157                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME),
5158            "{plan_str}"
5159        );
5160        assert!(planner.ctx.use_tsid, "{plan_str}");
5161    }
5162
5163    #[tokio::test]
5164    async fn comparison_bool_binary_join_uses_tsid_when_available() {
5165        let eval_stmt = build_eval_stmt("some_metric > bool some_alt_metric");
5166
5167        let table_provider = build_test_table_provider_with_tsid(
5168            &[
5169                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5170                (
5171                    DEFAULT_SCHEMA_NAME.to_string(),
5172                    "some_alt_metric".to_string(),
5173                ),
5174            ],
5175            2,
5176            1,
5177        )
5178        .await;
5179        let plan =
5180            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5181                .await
5182                .unwrap();
5183
5184        let plan_str = plan.display_indent_schema().to_string();
5185        assert!(
5186            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
5187            "{plan_str}"
5188        );
5189        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
5190        assert!(!plan_str.contains("tag_1 ="), "{plan_str}");
5191    }
5192
5193    #[tokio::test]
5194    async fn scalar_count_count_range_keeps_full_window() {
5195        let plan_str = build_optimized_tsid_plan(
5196            "scalar(count(count(some_metric) by (tag_0)))",
5197            1,
5198            1,
5199            100_000,
5200            1,
5201        )
5202        .await;
5203        assert!(plan_str.contains("ScalarCalculate: tags=[]"));
5204        assert!(plan_str.contains("PromInstantManipulate: range=[0..100000000]"));
5205        assert!(!plan_str.contains("PromInstantManipulate: range=[99999000..99999000]"));
5206    }
5207
5208    #[tokio::test]
5209    async fn scalar_count_count_rewrite_applies_inside_binary_expr_for_tsid_input() {
5210        let plan_str = build_optimized_tsid_plan(
5211            "sum(irate(some_metric[1h])) / scalar(count(count(some_metric) by (tag_0)))",
5212            2,
5213            1,
5214            10,
5215            300,
5216        )
5217        .await;
5218        assert!(plan_str.contains("Distinct:"), "{plan_str}");
5219    }
5220
5221    #[tokio::test]
5222    async fn nested_count_rewrite_keeps_full_series_key_with_tsid_input() {
5223        assert_nested_count_rewrite_applies(
5224            "count(count(some_metric) by (tag_0))",
5225            "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(count(some_metric.field_0))]]"
5226        )
5227        .await;
5228    }
5229
5230    #[tokio::test]
5231    async fn nested_sum_count_rewrite_keeps_full_series_key_with_tsid_input() {
5232        assert_nested_count_rewrite_applies(
5233            "count(sum(some_metric) by (tag_0))",
5234            "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(sum(some_metric.field_0))]]"
5235        )
5236        .await;
5237    }
5238
5239    #[tokio::test]
5240    async fn nested_supported_inner_aggs_rewrite_apply_for_tsid_input() {
5241        for (query, expected_outer_agg) in [
5242            (
5243                "count(avg(some_metric) by (tag_0))",
5244                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(avg(some_metric.field_0))]]",
5245            ),
5246            (
5247                "count(min(some_metric) by (tag_0))",
5248                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(min(some_metric.field_0))]]",
5249            ),
5250            (
5251                "count(max(some_metric) by (tag_0))",
5252                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(max(some_metric.field_0))]]",
5253            ),
5254            (
5255                "count(stddev(some_metric) by (tag_0))",
5256                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(stddev_pop(some_metric.field_0))]]",
5257            ),
5258            (
5259                "count(stdvar(some_metric) by (tag_0))",
5260                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(var_pop(some_metric.field_0))]]",
5261            ),
5262        ] {
5263            assert_nested_count_rewrite_applies(query, expected_outer_agg).await;
5264        }
5265    }
5266
5267    #[tokio::test]
5268    async fn nested_non_count_inner_aggs_rewrite_filter_null_values_for_tsid_input() {
5269        let count_plan =
5270            build_optimized_tsid_plan("count(count(some_metric) by (tag_0))", 2, 1, 100_000, 1)
5271                .await;
5272        assert!(
5273            !count_plan.contains("some_metric.field_0 IS NOT NULL"),
5274            "{count_plan}"
5275        );
5276
5277        for query in [
5278            "count(sum(some_metric) by (tag_0))",
5279            "count(avg(some_metric) by (tag_0))",
5280            "count(min(some_metric) by (tag_0))",
5281            "count(max(some_metric) by (tag_0))",
5282            "count(stddev(some_metric) by (tag_0))",
5283            "count(stdvar(some_metric) by (tag_0))",
5284        ] {
5285            let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await;
5286            assert!(
5287                plan_str.contains("Filter: some_metric.field_0 IS NOT NULL"),
5288                "{query}: {plan_str}"
5289            );
5290        }
5291    }
5292
5293    #[tokio::test]
5294    async fn nested_unsupported_or_non_direct_inner_aggs_do_not_rewrite() {
5295        assert_nested_count_rewrite_missing("count(group(some_metric) by (tag_0))", 2, 1).await;
5296        assert_nested_count_rewrite_missing(
5297            "count(sum(irate(some_metric[1h])) by (tag_0))",
5298            2,
5299            300,
5300        )
5301        .await;
5302    }
5303
5304    #[tokio::test]
5305    async fn physical_table_name_is_not_leaked_in_plan() {
5306        let prom_expr = parser::parse("some_metric").unwrap();
5307        let eval_stmt = EvalStmt {
5308            expr: prom_expr,
5309            start: UNIX_EPOCH,
5310            end: UNIX_EPOCH
5311                .checked_add(Duration::from_secs(100_000))
5312                .unwrap(),
5313            interval: Duration::from_secs(5),
5314            lookback_delta: Duration::from_secs(1),
5315        };
5316
5317        let table_provider = build_test_table_provider_with_tsid(
5318            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5319            1,
5320            1,
5321        )
5322        .await;
5323        let plan =
5324            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5325                .await
5326                .unwrap();
5327
5328        let plan_str = plan.display_indent_schema().to_string();
5329        assert!(plan_str.contains("TableScan: phy"), "{plan}");
5330        assert!(plan_str.contains("SubqueryAlias: some_metric"));
5331        assert!(plan_str.contains("Filter: phy.__table_id = UInt32(1024)"));
5332        assert!(!plan_str.contains("TableScan: some_metric"));
5333    }
5334
5335    #[tokio::test]
5336    async fn sum_without_does_not_group_by_tsid() {
5337        let prom_expr = parser::parse("sum without (tag_0) (some_metric)").unwrap();
5338        let eval_stmt = EvalStmt {
5339            expr: prom_expr,
5340            start: UNIX_EPOCH,
5341            end: UNIX_EPOCH
5342                .checked_add(Duration::from_secs(100_000))
5343                .unwrap(),
5344            interval: Duration::from_secs(5),
5345            lookback_delta: Duration::from_secs(1),
5346        };
5347
5348        let table_provider = build_test_table_provider_with_tsid(
5349            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5350            1,
5351            1,
5352        )
5353        .await;
5354        let plan =
5355            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5356                .await
5357                .unwrap();
5358
5359        let plan_str = plan.display_indent_schema().to_string();
5360        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
5361
5362        let aggr_line = plan_str
5363            .lines()
5364            .find(|line| line.contains("Aggregate: groupBy="))
5365            .unwrap();
5366        assert!(!aggr_line.contains(DATA_SCHEMA_TSID_COLUMN_NAME));
5367    }
5368
5369    #[tokio::test]
5370    async fn topk_without_does_not_partition_by_tsid() {
5371        let prom_expr = parser::parse("topk without (tag_0) (1, some_metric)").unwrap();
5372        let eval_stmt = EvalStmt {
5373            expr: prom_expr,
5374            start: UNIX_EPOCH,
5375            end: UNIX_EPOCH
5376                .checked_add(Duration::from_secs(100_000))
5377                .unwrap(),
5378            interval: Duration::from_secs(5),
5379            lookback_delta: Duration::from_secs(1),
5380        };
5381
5382        let table_provider = build_test_table_provider_with_tsid(
5383            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5384            1,
5385            1,
5386        )
5387        .await;
5388        let plan =
5389            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5390                .await
5391                .unwrap();
5392
5393        let plan_str = plan.display_indent_schema().to_string();
5394        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
5395
5396        let window_line = plan_str
5397            .lines()
5398            .find(|line| line.contains("WindowAggr: windowExpr=[[row_number()"))
5399            .unwrap();
5400        let partition_by = window_line
5401            .split("PARTITION BY [")
5402            .nth(1)
5403            .and_then(|s| s.split("] ORDER BY").next())
5404            .unwrap();
5405        assert!(!partition_by.contains(DATA_SCHEMA_TSID_COLUMN_NAME));
5406    }
5407
5408    #[tokio::test]
5409    async fn sum_by_does_not_group_by_tsid() {
5410        let prom_expr = parser::parse("sum by (__tsid) (some_metric)").unwrap();
5411        let eval_stmt = EvalStmt {
5412            expr: prom_expr,
5413            start: UNIX_EPOCH,
5414            end: UNIX_EPOCH
5415                .checked_add(Duration::from_secs(100_000))
5416                .unwrap(),
5417            interval: Duration::from_secs(5),
5418            lookback_delta: Duration::from_secs(1),
5419        };
5420
5421        let table_provider = build_test_table_provider_with_tsid(
5422            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5423            1,
5424            1,
5425        )
5426        .await;
5427        let plan =
5428            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5429                .await
5430                .unwrap();
5431
5432        let plan_str = plan.display_indent_schema().to_string();
5433        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
5434
5435        let aggr_line = plan_str
5436            .lines()
5437            .find(|line| line.contains("Aggregate: groupBy="))
5438            .unwrap();
5439        assert!(!aggr_line.contains(DATA_SCHEMA_TSID_COLUMN_NAME));
5440    }
5441
5442    #[tokio::test]
5443    async fn topk_by_does_not_partition_by_tsid() {
5444        let prom_expr = parser::parse("topk by (__tsid) (1, some_metric)").unwrap();
5445        let eval_stmt = EvalStmt {
5446            expr: prom_expr,
5447            start: UNIX_EPOCH,
5448            end: UNIX_EPOCH
5449                .checked_add(Duration::from_secs(100_000))
5450                .unwrap(),
5451            interval: Duration::from_secs(5),
5452            lookback_delta: Duration::from_secs(1),
5453        };
5454
5455        let table_provider = build_test_table_provider_with_tsid(
5456            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5457            1,
5458            1,
5459        )
5460        .await;
5461        let plan =
5462            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5463                .await
5464                .unwrap();
5465
5466        let plan_str = plan.display_indent_schema().to_string();
5467        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
5468
5469        let window_line = plan_str
5470            .lines()
5471            .find(|line| line.contains("WindowAggr: windowExpr=[[row_number()"))
5472            .unwrap();
5473        let partition_by = window_line
5474            .split("PARTITION BY [")
5475            .nth(1)
5476            .and_then(|s| s.split("] ORDER BY").next())
5477            .unwrap();
5478        assert!(!partition_by.contains(DATA_SCHEMA_TSID_COLUMN_NAME));
5479    }
5480
5481    #[tokio::test]
5482    async fn selector_matcher_on_tsid_does_not_use_internal_column() {
5483        let prom_expr = parser::parse(r#"some_metric{__tsid="123"}"#).unwrap();
5484        let eval_stmt = EvalStmt {
5485            expr: prom_expr,
5486            start: UNIX_EPOCH,
5487            end: UNIX_EPOCH
5488                .checked_add(Duration::from_secs(100_000))
5489                .unwrap(),
5490            interval: Duration::from_secs(5),
5491            lookback_delta: Duration::from_secs(1),
5492        };
5493
5494        let table_provider = build_test_table_provider_with_tsid(
5495            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5496            1,
5497            1,
5498        )
5499        .await;
5500        let plan =
5501            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5502                .await
5503                .unwrap();
5504
5505        fn collect_filter_cols(plan: &LogicalPlan, out: &mut HashSet<Column>) {
5506            if let LogicalPlan::Filter(filter) = plan {
5507                datafusion_expr::utils::expr_to_columns(&filter.predicate, out).unwrap();
5508            }
5509            for input in plan.inputs() {
5510                collect_filter_cols(input, out);
5511            }
5512        }
5513
5514        let mut filter_cols = HashSet::new();
5515        collect_filter_cols(&plan, &mut filter_cols);
5516        assert!(
5517            !filter_cols
5518                .iter()
5519                .any(|c| c.name == DATA_SCHEMA_TSID_COLUMN_NAME)
5520        );
5521    }
5522
5523    #[tokio::test]
5524    async fn tsid_is_not_used_when_physical_table_is_missing() {
5525        let prom_expr = parser::parse("some_metric").unwrap();
5526        let eval_stmt = EvalStmt {
5527            expr: prom_expr,
5528            start: UNIX_EPOCH,
5529            end: UNIX_EPOCH
5530                .checked_add(Duration::from_secs(100_000))
5531                .unwrap(),
5532            interval: Duration::from_secs(5),
5533            lookback_delta: Duration::from_secs(1),
5534        };
5535
5536        let catalog_list = MemoryCatalogManager::with_default_setup();
5537
5538        // Register a metric engine logical table referencing a missing physical table.
5539        let mut columns = vec![ColumnSchema::new(
5540            "tag_0".to_string(),
5541            ConcreteDataType::string_datatype(),
5542            false,
5543        )];
5544        columns.push(
5545            ColumnSchema::new(
5546                "timestamp".to_string(),
5547                ConcreteDataType::timestamp_millisecond_datatype(),
5548                false,
5549            )
5550            .with_time_index(true),
5551        );
5552        columns.push(ColumnSchema::new(
5553            "field_0".to_string(),
5554            ConcreteDataType::float64_datatype(),
5555            true,
5556        ));
5557        let schema = Arc::new(Schema::new(columns));
5558        let mut options = table::requests::TableOptions::default();
5559        options
5560            .extra_options
5561            .insert(LOGICAL_TABLE_METADATA_KEY.to_string(), "phy".to_string());
5562        let table_meta = TableMetaBuilder::empty()
5563            .schema(schema)
5564            .primary_key_indices(vec![0])
5565            .value_indices(vec![2])
5566            .engine(METRIC_ENGINE_NAME.to_string())
5567            .options(options)
5568            .next_column_id(1024)
5569            .build()
5570            .unwrap();
5571        let table_info = TableInfoBuilder::default()
5572            .table_id(1024)
5573            .name("some_metric")
5574            .meta(table_meta)
5575            .build()
5576            .unwrap();
5577        let table = EmptyTable::from_table_info(&table_info);
5578        catalog_list
5579            .register_table_sync(RegisterTableRequest {
5580                catalog: DEFAULT_CATALOG_NAME.to_string(),
5581                schema: DEFAULT_SCHEMA_NAME.to_string(),
5582                table_name: "some_metric".to_string(),
5583                table_id: 1024,
5584                table,
5585            })
5586            .unwrap();
5587
5588        let table_provider = DfTableSourceProvider::new(
5589            catalog_list,
5590            false,
5591            QueryContext::arc(),
5592            DummyDecoder::arc(),
5593            false,
5594        );
5595
5596        let plan =
5597            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5598                .await
5599                .unwrap();
5600
5601        let plan_str = plan.display_indent_schema().to_string();
5602        assert!(plan_str.contains("PromSeriesDivide: tags=[\"tag_0\"]"));
5603        assert!(!plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
5604    }
5605
5606    #[tokio::test]
5607    async fn tsid_is_carried_only_when_aggregate_preserves_label_set() {
5608        let prom_expr = parser::parse("sum by (tag_0) (some_metric)").unwrap();
5609        let eval_stmt = EvalStmt {
5610            expr: prom_expr,
5611            start: UNIX_EPOCH,
5612            end: UNIX_EPOCH
5613                .checked_add(Duration::from_secs(100_000))
5614                .unwrap(),
5615            interval: Duration::from_secs(5),
5616            lookback_delta: Duration::from_secs(1),
5617        };
5618
5619        let table_provider = build_test_table_provider_with_tsid(
5620            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5621            1,
5622            1,
5623        )
5624        .await;
5625        let plan =
5626            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5627                .await
5628                .unwrap();
5629
5630        let plan_str = plan.display_indent_schema().to_string();
5631        assert!(plan_str.contains("first_value") && plan_str.contains("__tsid"));
5632        assert!(
5633            !plan
5634                .schema()
5635                .fields()
5636                .iter()
5637                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
5638        );
5639
5640        // Merging aggregate: label set is reduced, tsid should not be carried.
5641        let prom_expr = parser::parse("sum(some_metric)").unwrap();
5642        let eval_stmt = EvalStmt {
5643            expr: prom_expr,
5644            start: UNIX_EPOCH,
5645            end: UNIX_EPOCH
5646                .checked_add(Duration::from_secs(100_000))
5647                .unwrap(),
5648            interval: Duration::from_secs(5),
5649            lookback_delta: Duration::from_secs(1),
5650        };
5651        let table_provider = build_test_table_provider_with_tsid(
5652            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5653            1,
5654            1,
5655        )
5656        .await;
5657        let plan =
5658            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5659                .await
5660                .unwrap();
5661        let plan_str = plan.display_indent_schema().to_string();
5662        assert!(!plan_str.contains("first_value"));
5663    }
5664
5665    #[tokio::test]
5666    async fn or_operator_with_unknown_metric_does_not_require_tsid() {
5667        let prom_expr = parser::parse("unknown_metric or some_metric").unwrap();
5668        let eval_stmt = EvalStmt {
5669            expr: prom_expr,
5670            start: UNIX_EPOCH,
5671            end: UNIX_EPOCH
5672                .checked_add(Duration::from_secs(100_000))
5673                .unwrap(),
5674            interval: Duration::from_secs(5),
5675            lookback_delta: Duration::from_secs(1),
5676        };
5677
5678        let table_provider = build_test_table_provider_with_tsid(
5679            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5680            1,
5681            1,
5682        )
5683        .await;
5684
5685        let plan =
5686            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5687                .await
5688                .unwrap();
5689
5690        assert!(
5691            !plan
5692                .schema()
5693                .fields()
5694                .iter()
5695                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
5696        );
5697    }
5698
5699    #[tokio::test]
5700    async fn aggregate_avg() {
5701        do_aggregate_expr_plan("avg", "avg").await;
5702    }
5703
5704    #[tokio::test]
5705    #[should_panic] // output type doesn't match
5706    async fn aggregate_count() {
5707        do_aggregate_expr_plan("count", "count").await;
5708    }
5709
5710    #[tokio::test]
5711    async fn aggregate_min() {
5712        do_aggregate_expr_plan("min", "min").await;
5713    }
5714
5715    #[tokio::test]
5716    async fn aggregate_max() {
5717        do_aggregate_expr_plan("max", "max").await;
5718    }
5719
5720    #[tokio::test]
5721    async fn aggregate_group() {
5722        // Regression test for `group()` aggregator.
5723        // PromQL: sum(group by (cluster)(kubernetes_build_info{service="kubernetes",job="apiserver"}))
5724        // should be plannable, and `group()` should produce constant 1 for each group.
5725        let prom_expr = parser::parse(
5726            "sum(group by (cluster)(kubernetes_build_info{service=\"kubernetes\",job=\"apiserver\"}))",
5727        )
5728        .unwrap();
5729        let eval_stmt = EvalStmt {
5730            expr: prom_expr,
5731            start: UNIX_EPOCH,
5732            end: UNIX_EPOCH
5733                .checked_add(Duration::from_secs(100_000))
5734                .unwrap(),
5735            interval: Duration::from_secs(5),
5736            lookback_delta: Duration::from_secs(1),
5737        };
5738
5739        let table_provider = build_test_table_provider_with_fields(
5740            &[(
5741                DEFAULT_SCHEMA_NAME.to_string(),
5742                "kubernetes_build_info".to_string(),
5743            )],
5744            &["cluster", "service", "job"],
5745        )
5746        .await;
5747        let plan =
5748            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5749                .await
5750                .unwrap();
5751
5752        let plan_str = plan.display_indent_schema().to_string();
5753        assert!(plan_str.contains("max(Float64(1"));
5754    }
5755
5756    #[tokio::test]
5757    async fn aggregate_stddev() {
5758        do_aggregate_expr_plan("stddev", "stddev_pop").await;
5759    }
5760
5761    #[tokio::test]
5762    async fn aggregate_stdvar() {
5763        do_aggregate_expr_plan("stdvar", "var_pop").await;
5764    }
5765
5766    // TODO(ruihang): add range fn tests once exprs are ready.
5767
    // {
    //     input: `some_metric{tag_0="foo"} + some_metric{tag_0="bar"}`,
    //     expected: &BinaryExpr{
    //         Op: ADD,
    //         LHS: &VectorSelector{
    //             Name: "some_metric",
    //             LabelMatchers: []*labels.Matcher{
    //                     MustLabelMatcher(labels.MatchEqual, "tag_0", "foo"),
    //                     MustLabelMatcher(labels.MatchEqual, model.MetricNameLabel, "some_metric"),
    //             },
    //         },
    //         RHS: &VectorSelector{
    //             Name: "some_metric",
    //             LabelMatchers: []*labels.Matcher{
    //                     MustLabelMatcher(labels.MatchEqual, "tag_0", "bar"),
    //                     MustLabelMatcher(labels.MatchEqual, model.MetricNameLabel, "some_metric"),
    //             },
    //         },
    //         VectorMatching: &VectorMatching{},
    //     },
    // },
5789    #[tokio::test]
5790    async fn binary_op_column_column() {
5791        let prom_expr =
5792            parser::parse(r#"some_metric{tag_0="foo"} + some_metric{tag_0="bar"}"#).unwrap();
5793        let eval_stmt = EvalStmt {
5794            expr: prom_expr,
5795            start: UNIX_EPOCH,
5796            end: UNIX_EPOCH
5797                .checked_add(Duration::from_secs(100_000))
5798                .unwrap(),
5799            interval: Duration::from_secs(5),
5800            lookback_delta: Duration::from_secs(1),
5801        };
5802
5803        let table_provider = build_test_table_provider(
5804            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5805            1,
5806            1,
5807        )
5808        .await;
5809        let plan =
5810            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5811                .await
5812                .unwrap();
5813
5814        let expected = String::from(
5815            "Projection: rhs.tag_0, rhs.timestamp, CAST(lhs.field_0 AS Float64) + CAST(rhs.field_0 AS Float64) AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
5816            \n  Inner Join: lhs.tag_0 = rhs.tag_0, lhs.timestamp = rhs.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5817            \n    SubqueryAlias: lhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5818            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5819            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5820            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5821            \n            Filter: some_metric.tag_0 = Utf8(\"foo\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5822            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5823            \n    SubqueryAlias: rhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5824            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5825            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5826            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5827            \n            Filter: some_metric.tag_0 = Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5828            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5829        );
5830
5831        assert_eq!(plan.display_indent_schema().to_string(), expected);
5832    }
5833
5834    async fn indie_query_plan_compare<T: AsRef<str>>(query: &str, expected: T) {
5835        let prom_expr = parser::parse(query).unwrap();
5836        let eval_stmt = EvalStmt {
5837            expr: prom_expr,
5838            start: UNIX_EPOCH,
5839            end: UNIX_EPOCH
5840                .checked_add(Duration::from_secs(100_000))
5841                .unwrap(),
5842            interval: Duration::from_secs(5),
5843            lookback_delta: Duration::from_secs(1),
5844        };
5845
5846        let table_provider = build_test_table_provider(
5847            &[
5848                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5849                (
5850                    "greptime_private".to_string(),
5851                    "some_alt_metric".to_string(),
5852                ),
5853            ],
5854            1,
5855            1,
5856        )
5857        .await;
5858        let plan =
5859            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5860                .await
5861                .unwrap();
5862
5863        assert_eq!(plan.display_indent_schema().to_string(), expected.as_ref());
5864    }
5865
5866    #[tokio::test]
5867    async fn binary_op_literal_column() {
5868        let query = r#"1 + some_metric{tag_0="bar"}"#;
5869        let expected = String::from(
5870            "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + CAST(some_metric.field_0 AS Float64) AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
5871            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5872            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5873            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5874            \n        Filter: some_metric.tag_0 = Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5875            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5876        );
5877
5878        indie_query_plan_compare(query, expected).await;
5879    }
5880
5881    #[tokio::test]
5882    async fn binary_op_literal_literal() {
5883        let query = r#"1 + 1"#;
5884        let expected = r#"EmptyMetric: range=[0..100000000], interval=[5000] [time:Timestamp(ms), value:Float64;N]
5885  TableScan: dummy [time:Timestamp(ms), value:Float64;N]"#;
5886        indie_query_plan_compare(query, expected).await;
5887    }
5888
5889    #[tokio::test]
5890    async fn simple_bool_grammar() {
5891        let query = "some_metric != bool 1.2345";
5892        let expected = String::from(
5893            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 != Float64(1.2345) AS Float64) AS field_0 != Float64(1.2345) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 != Float64(1.2345):Float64;N]\
5894            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5895            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5896            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5897            \n        Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5898            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5899        );
5900
5901        indie_query_plan_compare(query, expected).await;
5902    }
5903
5904    #[tokio::test]
5905    async fn bool_with_additional_arithmetic() {
5906        let query = "some_metric + (1 == bool 2)";
5907        let expected = String::from(
5908            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 AS Float64) + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
5909            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5910            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5911            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5912            \n        Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5913            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5914        );
5915
5916        indie_query_plan_compare(query, expected).await;
5917    }
5918
5919    #[tokio::test]
5920    async fn simple_unary() {
            // Snapshot test for unary minus on an instant vector.
            // The expected plan negates the value column in a Projection,
            // `(- some_metric.field_0) AS (- field_0)`, while the tag and timestamp
            // columns pass through unchanged.
5921        let query = "-some_metric";
5922        let expected = String::from(
5923            "Projection: some_metric.tag_0, some_metric.timestamp, (- some_metric.field_0) AS (- field_0) [tag_0:Utf8, timestamp:Timestamp(ms), (- field_0):Float64;N]\
5924            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5925            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5926            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5927            \n        Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5928            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5929        );
5930
            // NOTE(review): the helper below lives outside this view; presumably it
            // plans `query` and compares the rendered plan with `expected` -- confirm.
5931        indie_query_plan_compare(query, expected).await;
5932    }
5933
5934    #[tokio::test]
5935    async fn increase_aggr() {
            // Snapshot test for `increase()` over a 5-minute range selector.
            // In the expected plan the 5m window appears as `Int64(300000)` in the
            // `prom_increase` call and as `eval range=[300000]` on PromRangeManipulate.
            // The table-scan time filter starts at -299999, and the outermost Filter
            // keeps only rows where the function result IS NOT NULL.
5936        let query = "increase(some_metric[5m])";
5937        let expected = String::from(
5938            "Filter: prom_increase(timestamp_range,field_0,timestamp,Int64(300000)) IS NOT NULL [timestamp:Timestamp(ms), prom_increase(timestamp_range,field_0,timestamp,Int64(300000)):Float64;N, tag_0:Utf8]\
5939            \n  Projection: some_metric.timestamp, prom_increase(timestamp_range, field_0, some_metric.timestamp, Int64(300000)) AS prom_increase(timestamp_range,field_0,timestamp,Int64(300000)), some_metric.tag_0 [timestamp:Timestamp(ms), prom_increase(timestamp_range,field_0,timestamp,Int64(300000)):Float64;N, tag_0:Utf8]\
5940            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[300000], time index=[timestamp], values=[\"field_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Dictionary(Int64, Float64);N, timestamp_range:Dictionary(Int64, Timestamp(ms))]\
5941            \n      PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5942            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5943            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5944            \n            Filter: some_metric.timestamp >= TimestampMillisecond(-299999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5945            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5946        );
5947
            // NOTE(review): the helper below lives outside this view; presumably it
            // plans `query` and compares the rendered plan with `expected` -- confirm.
5948        indie_query_plan_compare(query, expected).await;
5949    }
5950
5951    #[tokio::test]
5952    async fn less_filter_on_value() {
            // Snapshot test for a non-`bool` comparison between an instant vector and
            // a number literal. The expected plan has no Projection: the comparison
            // is a plain Filter (`field_0 < Float64(1.2345)`) placed directly above
            // PromInstantManipulate, so the output schema is the table's own schema.
5953        let query = "some_metric < 1.2345";
5954        let expected = String::from(
5955            "Filter: some_metric.field_0 < Float64(1.2345) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5956            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5957            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5958            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5959            \n        Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5960            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5961        );
5962
            // NOTE(review): the helper below lives outside this view; presumably it
            // plans `query` and compares the rendered plan with `expected` -- confirm.
5963        indie_query_plan_compare(query, expected).await;
5964    }
5965
5966    #[tokio::test]
5967    async fn count_over_time() {
            // Snapshot test for `count_over_time()` over a 5-minute range selector.
            // The expected plan calls `prom_count_over_time(timestamp_range, field_0)`
            // above a PromRangeManipulate with `eval range=[300000]`, and the outermost
            // Filter keeps only rows where the function result IS NOT NULL.
5968        let query = "count_over_time(some_metric[5m])";
5969        let expected = String::from(
5970            "Filter: prom_count_over_time(timestamp_range,field_0) IS NOT NULL [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
5971            \n  Projection: some_metric.timestamp, prom_count_over_time(timestamp_range, field_0) AS prom_count_over_time(timestamp_range,field_0), some_metric.tag_0 [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
5972            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[300000], time index=[timestamp], values=[\"field_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Dictionary(Int64, Float64);N, timestamp_range:Dictionary(Int64, Timestamp(ms))]\
5973            \n      PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5974            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5975            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5976            \n            Filter: some_metric.timestamp >= TimestampMillisecond(-299999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5977            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5978        );
5979
            // NOTE(review): the helper below lives outside this view; presumably it
            // plans `query` and compares the rendered plan with `expected` -- confirm.
5980        indie_query_plan_compare(query, expected).await;
5981    }
5982
5983    /// The outer `PromRangeManipulate` from a subquery must be preceded by
5984    /// `Sort` + `PromSeriesDivide`.
        ///
        /// The query is the subquery form `some_metric[10m:1m]`. In the expected plan
        /// the inner instant evaluation runs with `interval=[60000]` over
        /// `range=[-540000..100000000]`, and the outer range step uses
        /// `eval range=[600000]` with the statement's own `interval=[5000]`.
5985    #[tokio::test]
5986    async fn count_over_time_subquery() {
5987        let query = "count_over_time(some_metric[10m:1m])";
5988        let expected = String::from(
5989            "Filter: prom_count_over_time(timestamp_range,field_0) IS NOT NULL [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
5990            \n  Projection: some_metric.timestamp, prom_count_over_time(timestamp_range, field_0) AS prom_count_over_time(timestamp_range,field_0), some_metric.tag_0 [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
5991            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[600000], time index=[timestamp], values=[\"field_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Dictionary(Int64, Float64);N, timestamp_range:Dictionary(Int64, Timestamp(ms))]\
5992            \n      PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5993            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5994            \n          PromInstantManipulate: range=[-540000..100000000], lookback=[1000], interval=[60000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5995            \n            PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5996            \n              Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5997            \n                Filter: some_metric.timestamp >= TimestampMillisecond(-540999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5998            \n                  TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
5999        );
            // NOTE(review): the helper below lives outside this view; presumably it
            // plans `query` and compares the rendered plan with `expected` -- confirm.
6000        indie_query_plan_compare(query, expected).await;
6001    }
6002
6003    #[tokio::test]
6004    async fn test_hash_join() {
            // Plans a vector division that uses `ignoring(...)` for label matching and
            // asserts the exact `to_string()` rendering of the resulting logical plan.
            //
            // Template statement: the literal `1.0` expression is a placeholder that is
            // overwritten with the parsed query below. The window is
            // [epoch, epoch + 100_000 s] with a 5 s step and a 1 s lookback, which
            // shows up in the expected plan as range=[0..100000000], interval=[5000],
            // lookback=[1000].
6005        let mut eval_stmt = EvalStmt {
6006            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6007            start: UNIX_EPOCH,
6008            end: UNIX_EPOCH
6009                .checked_add(Duration::from_secs(100_000))
6010                .unwrap(),
6011            interval: Duration::from_secs(5),
6012            lookback_delta: Duration::from_secs(1),
6013        };
6014
6015        let case = r#"http_server_requests_seconds_sum{uri="/accounts/login"} / ignoring(kubernetes_pod_name,kubernetes_namespace) http_server_requests_seconds_count{uri="/accounts/login"}"#;
6016
6017        let prom_expr = parser::parse(case).unwrap();
6018        eval_stmt.expr = prom_expr;
            // Both metrics are registered with the same three tag names, two of which
            // are the labels listed in `ignoring(...)`.
6019        let table_provider = build_test_table_provider_with_fields(
6020            &[
6021                (
6022                    DEFAULT_SCHEMA_NAME.to_string(),
6023                    "http_server_requests_seconds_sum".to_string(),
6024                ),
6025                (
6026                    DEFAULT_SCHEMA_NAME.to_string(),
6027                    "http_server_requests_seconds_count".to_string(),
6028                ),
6029            ],
6030            &["uri", "kubernetes_namespace", "kubernetes_pod_name"],
6031        )
6032        .await;
6033        // Should be ok
6034        let plan =
6035            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6036                .await
6037                .unwrap();
            // Expected shape: an Inner Join keyed on the time index and `uri` only;
            // the two ignored labels do not appear in the join condition. Each side
            // is its own instant-selector pipeline wrapped in a SubqueryAlias.
6038        let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, CAST(http_server_requests_seconds_sum.greptime_value AS Float64) / CAST(http_server_requests_seconds_count.greptime_value AS Float64) AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
6039            \n  Inner Join: http_server_requests_seconds_sum.greptime_timestamp = http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.uri = http_server_requests_seconds_count.uri\
6040            \n    SubqueryAlias: http_server_requests_seconds_sum\
6041            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\
6042            \n        PromSeriesDivide: tags=[\"uri\", \"kubernetes_namespace\", \"kubernetes_pod_name\"]\
6043            \n          Sort: http_server_requests_seconds_sum.uri ASC NULLS FIRST, http_server_requests_seconds_sum.kubernetes_namespace ASC NULLS FIRST, http_server_requests_seconds_sum.kubernetes_pod_name ASC NULLS FIRST, http_server_requests_seconds_sum.greptime_timestamp ASC NULLS FIRST\
6044            \n            Filter: http_server_requests_seconds_sum.uri = Utf8(\"/accounts/login\") AND http_server_requests_seconds_sum.greptime_timestamp >= TimestampMillisecond(-999, None) AND http_server_requests_seconds_sum.greptime_timestamp <= TimestampMillisecond(100000000, None)\
6045            \n              TableScan: http_server_requests_seconds_sum\
6046            \n    SubqueryAlias: http_server_requests_seconds_count\
6047            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\
6048            \n        PromSeriesDivide: tags=[\"uri\", \"kubernetes_namespace\", \"kubernetes_pod_name\"]\
6049            \n          Sort: http_server_requests_seconds_count.uri ASC NULLS FIRST, http_server_requests_seconds_count.kubernetes_namespace ASC NULLS FIRST, http_server_requests_seconds_count.kubernetes_pod_name ASC NULLS FIRST, http_server_requests_seconds_count.greptime_timestamp ASC NULLS FIRST\
6050            \n            Filter: http_server_requests_seconds_count.uri = Utf8(\"/accounts/login\") AND http_server_requests_seconds_count.greptime_timestamp >= TimestampMillisecond(-999, None) AND http_server_requests_seconds_count.greptime_timestamp <= TimestampMillisecond(100000000, None)\
6051            \n              TableScan: http_server_requests_seconds_count";
6052        assert_eq!(plan.to_string(), expected);
6053    }
6054
6055    #[tokio::test]
6056    async fn test_nested_histogram_quantile() {
            // Smoke test: `label_replace` wrapped around `histogram_quantile` over a
            // `sum by (...) (rate(...))` aggregation must plan without error.
            // Only success is checked; the produced plan is discarded.
            //
            // Template statement; the placeholder literal expression is replaced by
            // the parsed query below.
6057        let mut eval_stmt = EvalStmt {
6058            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6059            start: UNIX_EPOCH,
6060            end: UNIX_EPOCH
6061                .checked_add(Duration::from_secs(100_000))
6062                .unwrap(),
6063            interval: Duration::from_secs(5),
6064            lookback_delta: Duration::from_secs(1),
6065        };
6066
6067        let case = r#"label_replace(histogram_quantile(0.99, sum by(pod, le, path, code) (rate(greptime_servers_grpc_requests_elapsed_bucket{container="frontend"}[1m0s]))), "pod_new", "$1", "pod", "greptimedb-frontend-[0-9a-z]*-(.*)")"#;
6068
6069        let prom_expr = parser::parse(case).unwrap();
6070        eval_stmt.expr = prom_expr;
            // The tag list covers every label the query references: the four
            // `sum by` labels plus `container` from the selector's matcher.
6071        let table_provider = build_test_table_provider_with_fields(
6072            &[(
6073                DEFAULT_SCHEMA_NAME.to_string(),
6074                "greptime_servers_grpc_requests_elapsed_bucket".to_string(),
6075            )],
6076            &["pod", "le", "path", "code", "container"],
6077        )
6078        .await;
6079        // Should be ok
6080        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6081            .await
6082            .unwrap();
6083    }
6084
6085    #[tokio::test]
6086    async fn test_histogram_quantile_binary_op() {
            // Smoke test: planning must succeed; the produced plan is discarded.
            //
            // Template statement; the placeholder literal expression is replaced by
            // the parsed query below.
6087        let mut eval_stmt = EvalStmt {
6088            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6089            start: UNIX_EPOCH,
6090            end: UNIX_EPOCH
6091                .checked_add(Duration::from_secs(100_000))
6092                .unwrap(),
6093            interval: Duration::from_secs(5),
6094            lookback_delta: Duration::from_secs(1),
6095        };
6096
6097        // Arithmetic applied to a histogram_quantile() result. Regression for #8144:
6098        // HistogramFold used to drop the input column qualifiers, so the binary-op
6099        // projection failed to resolve the qualified tag column.
6100        let case = r#"histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) + 0"#;
6101
6102        let prom_expr = parser::parse(case).unwrap();
6103        eval_stmt.expr = prom_expr;
            // The table is registered with exactly the two labels used in `sum by`.
6104        let table_provider = build_test_table_provider_with_fields(
6105            &[(
6106                DEFAULT_SCHEMA_NAME.to_string(),
6107                "http_request_duration_seconds_bucket".to_string(),
6108            )],
6109            &["pod", "le"],
6110        )
6111        .await;
6112        // Should plan without a "No field named ..." error.
6113        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6114            .await
6115            .unwrap();
6116    }
6117
6118    #[tokio::test]
6119    async fn test_parse_and_operator() {
            // Smoke test for the set operators `and` / `unless` nested inside `count(...)`
            // and followed by `or vector (0)`. Each query only has to parse and plan
            // successfully; the resulting plans are discarded.
            //
            // Template statement; `expr` is overwritten for every case in the loop.
6120        let mut eval_stmt = EvalStmt {
6121            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6122            start: UNIX_EPOCH,
6123            end: UNIX_EPOCH
6124                .checked_add(Duration::from_secs(100_000))
6125                .unwrap(),
6126            interval: Duration::from_secs(5),
6127            lookback_delta: Duration::from_secs(1),
6128        };
6129
            // The two queries are identical except for the set operator:
            // the first uses `and`, the second uses `unless`.
6130        let cases = [
6131            r#"count (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes{namespace=~".+"} ) and (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes{namespace=~".+"} )) / (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_capacity_bytes{namespace=~".+"} )) >= (80 / 100)) or vector (0)"#,
6132            r#"count (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes{namespace=~".+"} ) unless (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes{namespace=~".+"} )) / (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_capacity_bytes{namespace=~".+"} )) >= (80 / 100)) or vector (0)"#,
6133        ];
6134
6135        for case in cases {
6136            let prom_expr = parser::parse(case).unwrap();
6137            eval_stmt.expr = prom_expr;
                // A fresh provider is built on every iteration because
                // `stmt_to_plan` takes it by value.
6138            let table_provider = build_test_table_provider_with_fields(
6139                &[
6140                    (
6141                        DEFAULT_SCHEMA_NAME.to_string(),
6142                        "kubelet_volume_stats_used_bytes".to_string(),
6143                    ),
6144                    (
6145                        DEFAULT_SCHEMA_NAME.to_string(),
6146                        "kubelet_volume_stats_capacity_bytes".to_string(),
6147                    ),
6148                ],
6149                &["namespace", "persistentvolumeclaim"],
6150            )
6151            .await;
6152            // Should be ok
6153            let _ =
6154                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6155                    .await
6156                    .unwrap();
6157        }
6158    }
6159
6160    #[tokio::test]
6161    async fn test_nested_binary_op() {
            // Smoke test: a subtraction whose right-hand side is a parenthesized
            // `... or vector(0)` expression must parse and plan successfully.
            // The resulting plan is discarded.
            //
            // Template statement; the placeholder literal expression is replaced by
            // the parsed query below.
6162        let mut eval_stmt = EvalStmt {
6163            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6164            start: UNIX_EPOCH,
6165            end: UNIX_EPOCH
6166                .checked_add(Duration::from_secs(100_000))
6167                .unwrap(),
6168            interval: Duration::from_secs(5),
6169            lookback_delta: Duration::from_secs(1),
6170        };
6171
            // Both operands aggregate the same metric with `sum(rate(...[2m]))`; they
            // differ only in the label used by the regex matcher (`job` vs `namespace`).
6172        let case = r#"sum(rate(nginx_ingress_controller_requests{job=~".*"}[2m])) -
6173        (
6174            sum(rate(nginx_ingress_controller_requests{namespace=~".*"}[2m]))
6175            or
6176            vector(0)
6177        )"#;
6178
6179        let prom_expr = parser::parse(case).unwrap();
6180        eval_stmt.expr = prom_expr;
6181        let table_provider = build_test_table_provider_with_fields(
6182            &[(
6183                DEFAULT_SCHEMA_NAME.to_string(),
6184                "nginx_ingress_controller_requests".to_string(),
6185            )],
6186            &["namespace", "job"],
6187        )
6188        .await;
6189        // Should be ok
6190        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6191            .await
6192            .unwrap();
6193    }
6194
6195    #[tokio::test]
6196    async fn test_parse_or_operator() {
            // Smoke test for the `or` set operator in four different query shapes.
            // Every query only has to parse and plan successfully; the resulting
            // plans are discarded.
            //
            // Template statement; `expr` is overwritten before each planning call.
6197        let mut eval_stmt = EvalStmt {
6198            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6199            start: UNIX_EPOCH,
6200            end: UNIX_EPOCH
6201                .checked_add(Duration::from_secs(100_000))
6202                .unwrap(),
6203            interval: Duration::from_secs(5),
6204            lookback_delta: Duration::from_secs(1),
6205        };
6206
            // Case 1: `or` between two ratio expressions over the same metric, the
            // first of which uses `rate(...[120s])`.
6207        let case = r#"
6208        sum(rate(sysstat{tenant_name=~"tenant1",cluster_name=~"cluster1"}[120s])) by (cluster_name,tenant_name) /
6209        (sum(sysstat{tenant_name=~"tenant1",cluster_name=~"cluster1"}) by (cluster_name,tenant_name) * 100)
6210            or
6211        200 * sum(sysstat{tenant_name=~"tenant1",cluster_name=~"cluster1"}) by (cluster_name,tenant_name) /
6212        sum(sysstat{tenant_name=~"tenant1",cluster_name=~"cluster1"}) by (cluster_name,tenant_name)"#;
6213
6214        let table_provider = build_test_table_provider_with_fields(
6215            &[(DEFAULT_SCHEMA_NAME.to_string(), "sysstat".to_string())],
6216            &["tenant_name", "cluster_name"],
6217        )
6218        .await;
6219        eval_stmt.expr = parser::parse(case).unwrap();
6220        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6221            .await
6222            .unwrap();
6223
            // Case 2: two chained `or`s whose operands are `>= 0` comparisons over
            // `delta(...[2m])`-based ratios of the same metric.
6224        let case = r#"sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) /
6225            (sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) *1000) +
6226            sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) /
6227            (sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) *1000) >= 0
6228            or
6229            sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) /
6230            (sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) *1000) >= 0
6231            or
6232            sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) /
6233            (sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) *1000) >= 0"#;
            // The provider is rebuilt for each case because `stmt_to_plan` takes it
            // by value.
6234        let table_provider = build_test_table_provider_with_fields(
6235            &[(DEFAULT_SCHEMA_NAME.to_string(), "sysstat".to_string())],
6236            &["tenant_name", "cluster_name"],
6237        )
6238        .await;
6239        eval_stmt.expr = parser::parse(case).unwrap();
6240        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6241            .await
6242            .unwrap();
6243
            // Case 3: chained `or` across two different metrics: their sum first,
            // then each metric on its own.
6244        let case = r#"(sum(background_waitevent_cnt{tenant_name=~"sys",cluster_name=~"cluster1"}) by (cluster_name,tenant_name) +
6245            sum(foreground_waitevent_cnt{tenant_name=~"sys",cluster_name=~"cluster1"}) by (cluster_name,tenant_name)) or
6246            (sum(background_waitevent_cnt{tenant_name=~"sys",cluster_name=~"cluster1"}) by (cluster_name,tenant_name)) or
6247            (sum(foreground_waitevent_cnt{tenant_name=~"sys",cluster_name=~"cluster1"}) by (cluster_name,tenant_name))"#;
6248        let table_provider = build_test_table_provider_with_fields(
6249            &[
6250                (
6251                    DEFAULT_SCHEMA_NAME.to_string(),
6252                    "background_waitevent_cnt".to_string(),
6253                ),
6254                (
6255                    DEFAULT_SCHEMA_NAME.to_string(),
6256                    "foreground_waitevent_cnt".to_string(),
6257                ),
6258            ],
6259            &["tenant_name", "cluster_name"],
6260        )
6261        .await;
6262        eval_stmt.expr = parser::parse(case).unwrap();
6263        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6264            .await
6265            .unwrap();
6266
            // Case 4: `or` between an `avg` aggregation and a `*` / `/` expression
            // built from `max` aggregations over two other metrics.
6267        let case = r#"avg(node_load1{cluster_name=~"cluster1"}) by (cluster_name,host_name) or max(container_cpu_load_average_10s{cluster_name=~"cluster1"}) by (cluster_name,host_name) * 100 / max(container_spec_cpu_quota{cluster_name=~"cluster1"}) by (cluster_name,host_name)"#;
6268        let table_provider = build_test_table_provider_with_fields(
6269            &[
6270                (DEFAULT_SCHEMA_NAME.to_string(), "node_load1".to_string()),
6271                (
6272                    DEFAULT_SCHEMA_NAME.to_string(),
6273                    "container_cpu_load_average_10s".to_string(),
6274                ),
6275                (
6276                    DEFAULT_SCHEMA_NAME.to_string(),
6277                    "container_spec_cpu_quota".to_string(),
6278                ),
6279            ],
6280            &["cluster_name", "host_name"],
6281        )
6282        .await;
6283        eval_stmt.expr = parser::parse(case).unwrap();
6284        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6285            .await
6286            .unwrap();
6287    }
6288
6289    #[tokio::test]
6290    async fn value_matcher() {
            // Checks how `__field__` matchers on a selector decide which value columns
            // appear in the planned output schema. Each good case pairs a query with
            // the full set of expected output field names; each bad case must fail
            // to plan.
6291        // template
6292        let mut eval_stmt = EvalStmt {
6293            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6294            start: UNIX_EPOCH,
6295            end: UNIX_EPOCH
6296                .checked_add(Duration::from_secs(100_000))
6297                .unwrap(),
6298            interval: Duration::from_secs(5),
6299            lookback_delta: Duration::from_secs(1),
6300        };
6301
            // (query, expected qualified output field names). The order of names in
            // each list is irrelevant: both sides are sorted before comparison below.
6302        let cases = [
6303            // single equal matcher
6304            (
6305                r#"some_metric{__field__="field_1"}"#,
6306                vec![
6307                    "some_metric.field_1",
6308                    "some_metric.tag_0",
6309                    "some_metric.tag_1",
6310                    "some_metric.tag_2",
6311                    "some_metric.timestamp",
6312                ],
6313            ),
6314            // two equal matchers
6315            (
6316                r#"some_metric{__field__="field_1", __field__="field_0"}"#,
6317                vec![
6318                    "some_metric.field_0",
6319                    "some_metric.field_1",
6320                    "some_metric.tag_0",
6321                    "some_metric.tag_1",
6322                    "some_metric.tag_2",
6323                    "some_metric.timestamp",
6324                ],
6325            ),
6326            // single not_eq matcher
6327            (
6328                r#"some_metric{__field__!="field_1"}"#,
6329                vec![
6330                    "some_metric.field_0",
6331                    "some_metric.field_2",
6332                    "some_metric.tag_0",
6333                    "some_metric.tag_1",
6334                    "some_metric.tag_2",
6335                    "some_metric.timestamp",
6336                ],
6337            ),
6338            // two not_eq matchers
6339            (
6340                r#"some_metric{__field__!="field_1", __field__!="field_2"}"#,
6341                vec![
6342                    "some_metric.field_0",
6343                    "some_metric.tag_0",
6344                    "some_metric.tag_1",
6345                    "some_metric.tag_2",
6346                    "some_metric.timestamp",
6347                ],
6348            ),
6349            // equal and not_eq matchers (no conflict)
6350            (
6351                r#"some_metric{__field__="field_1", __field__!="field_0"}"#,
6352                vec![
6353                    "some_metric.field_1",
6354                    "some_metric.tag_0",
6355                    "some_metric.tag_1",
6356                    "some_metric.tag_2",
6357                    "some_metric.timestamp",
6358                ],
6359            ),
6360            // equal and not_eq matchers (conflict)
                // Selecting and excluding the same field leaves no value column at
                // all: only the tags and the timestamp are expected.
6361            (
6362                r#"some_metric{__field__="field_2", __field__!="field_2"}"#,
6363                vec![
6364                    "some_metric.tag_0",
6365                    "some_metric.tag_1",
6366                    "some_metric.tag_2",
6367                    "some_metric.timestamp",
6368                ],
6369            ),
6370            // single regex eq matcher
6371            (
6372                r#"some_metric{__field__=~"field_1|field_2"}"#,
6373                vec![
6374                    "some_metric.field_1",
6375                    "some_metric.field_2",
6376                    "some_metric.tag_0",
6377                    "some_metric.tag_1",
6378                    "some_metric.tag_2",
6379                    "some_metric.timestamp",
6380                ],
6381            ),
6382            // single regex not_eq matcher
6383            (
6384                r#"some_metric{__field__!~"field_1|field_2"}"#,
6385                vec![
6386                    "some_metric.field_0",
6387                    "some_metric.tag_0",
6388                    "some_metric.tag_1",
6389                    "some_metric.tag_2",
6390                    "some_metric.timestamp",
6391                ],
6392            ),
6393        ];
6394
6395        for case in cases {
6396            let prom_expr = parser::parse(case.0).unwrap();
6397            eval_stmt.expr = prom_expr;
                // NOTE(review): the two `3` arguments presumably request three tag
                // columns and three field columns, which would match the
                // tag_0..tag_2 / field_0..field_2 names in the expectations above --
                // confirm against `build_test_table_provider`.
6398            let table_provider = build_test_table_provider(
6399                &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6400                3,
6401                3,
6402            )
6403            .await;
6404            let plan =
6405                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6406                    .await
6407                    .unwrap();
                // Compare as sorted lists so the check does not depend on the column
                // order of the plan's schema.
6408            let mut fields = plan.schema().field_names();
6409            let mut expected = case.1.into_iter().map(String::from).collect::<Vec<_>>();
6410            fields.sort();
6411            expected.sort();
6412            assert_eq!(fields, expected, "case: {:?}", case.0);
6413        }
6414
            // Referring to a field name that the table does not have must be a
            // planning error, for the equality and the inequality matcher alike.
6415        let bad_cases = [
6416            r#"some_metric{__field__="nonexistent"}"#,
6417            r#"some_metric{__field__!="nonexistent"}"#,
6418        ];
6419
6420        for case in bad_cases {
6421            let prom_expr = parser::parse(case).unwrap();
6422            eval_stmt.expr = prom_expr;
6423            let table_provider = build_test_table_provider(
6424                &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6425                3,
6426                3,
6427            )
6428            .await;
                // No `unwrap()` here: the `Result` itself is what gets asserted on.
6429            let plan =
6430                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6431                    .await;
6432            assert!(plan.is_err(), "case: {:?}", case);
6433        }
6434    }
6435
6436    #[tokio::test]
6437    async fn custom_schema() {
6438        let query = "some_alt_metric{__schema__=\"greptime_private\"}";
6439        let expected = String::from(
6440            "PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6441            \n  PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6442            \n    Sort: greptime_private.some_alt_metric.tag_0 ASC NULLS FIRST, greptime_private.some_alt_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6443            \n      Filter: greptime_private.some_alt_metric.timestamp >= TimestampMillisecond(-999, None) AND greptime_private.some_alt_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6444            \n        TableScan: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
6445        );
6446
6447        indie_query_plan_compare(query, expected).await;
6448
6449        let query = "some_alt_metric{__database__=\"greptime_private\"}";
6450        let expected = String::from(
6451            "PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6452            \n  PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6453            \n    Sort: greptime_private.some_alt_metric.tag_0 ASC NULLS FIRST, greptime_private.some_alt_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6454            \n      Filter: greptime_private.some_alt_metric.timestamp >= TimestampMillisecond(-999, None) AND greptime_private.some_alt_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6455            \n        TableScan: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
6456        );
6457
6458        indie_query_plan_compare(query, expected).await;
6459
6460        let query = "some_alt_metric{__schema__=\"greptime_private\"} / some_metric";
6461        let expected = String::from(
6462            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(greptime_private.some_alt_metric.field_0 AS Float64) / CAST(some_metric.field_0 AS Float64) AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
6463            \n  Inner Join: greptime_private.some_alt_metric.tag_0 = some_metric.tag_0, greptime_private.some_alt_metric.timestamp = some_metric.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6464            \n    SubqueryAlias: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6465            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6466            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6467            \n          Sort: greptime_private.some_alt_metric.tag_0 ASC NULLS FIRST, greptime_private.some_alt_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6468            \n            Filter: greptime_private.some_alt_metric.timestamp >= TimestampMillisecond(-999, None) AND greptime_private.some_alt_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6469            \n              TableScan: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6470            \n    SubqueryAlias: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6471            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6472            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6473            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6474            \n            Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6475            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
6476        );
6477
6478        indie_query_plan_compare(query, expected).await;
6479    }
6480
6481    #[tokio::test]
6482    async fn only_equals_is_supported_for_special_matcher() {
6483        let queries = &[
6484            "some_alt_metric{__schema__!=\"greptime_private\"}",
6485            "some_alt_metric{__schema__=~\"lalala\"}",
6486            "some_alt_metric{__database__!=\"greptime_private\"}",
6487            "some_alt_metric{__database__=~\"lalala\"}",
6488        ];
6489
6490        for query in queries {
6491            let prom_expr = parser::parse(query).unwrap();
6492            let eval_stmt = EvalStmt {
6493                expr: prom_expr,
6494                start: UNIX_EPOCH,
6495                end: UNIX_EPOCH
6496                    .checked_add(Duration::from_secs(100_000))
6497                    .unwrap(),
6498                interval: Duration::from_secs(5),
6499                lookback_delta: Duration::from_secs(1),
6500            };
6501
6502            let table_provider = build_test_table_provider(
6503                &[
6504                    (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6505                    (
6506                        "greptime_private".to_string(),
6507                        "some_alt_metric".to_string(),
6508                    ),
6509                ],
6510                1,
6511                1,
6512            )
6513            .await;
6514
6515            let plan =
6516                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6517                    .await;
6518            assert!(plan.is_err(), "query: {:?}", query);
6519        }
6520    }
6521
6522    #[tokio::test]
6523    async fn test_non_ms_precision() {
6524        let catalog_list = MemoryCatalogManager::with_default_setup();
6525        let columns = vec![
6526            ColumnSchema::new(
6527                "tag".to_string(),
6528                ConcreteDataType::string_datatype(),
6529                false,
6530            ),
6531            ColumnSchema::new(
6532                "timestamp".to_string(),
6533                ConcreteDataType::timestamp_nanosecond_datatype(),
6534                false,
6535            )
6536            .with_time_index(true),
6537            ColumnSchema::new(
6538                "field".to_string(),
6539                ConcreteDataType::float64_datatype(),
6540                true,
6541            ),
6542        ];
6543        let schema = Arc::new(Schema::new(columns));
6544        let table_meta = TableMetaBuilder::empty()
6545            .schema(schema)
6546            .primary_key_indices(vec![0])
6547            .value_indices(vec![2])
6548            .next_column_id(1024)
6549            .build()
6550            .unwrap();
6551        let table_info = TableInfoBuilder::default()
6552            .name("metrics".to_string())
6553            .meta(table_meta)
6554            .build()
6555            .unwrap();
6556        let table = EmptyTable::from_table_info(&table_info);
6557        assert!(
6558            catalog_list
6559                .register_table_sync(RegisterTableRequest {
6560                    catalog: DEFAULT_CATALOG_NAME.to_string(),
6561                    schema: DEFAULT_SCHEMA_NAME.to_string(),
6562                    table_name: "metrics".to_string(),
6563                    table_id: 1024,
6564                    table,
6565                })
6566                .is_ok()
6567        );
6568
6569        let plan = PromPlanner::stmt_to_plan(
6570            DfTableSourceProvider::new(
6571                catalog_list.clone(),
6572                false,
6573                QueryContext::arc(),
6574                DummyDecoder::arc(),
6575                true,
6576            ),
6577            &EvalStmt {
6578                expr: parser::parse("metrics{tag = \"1\"}").unwrap(),
6579                start: UNIX_EPOCH,
6580                end: UNIX_EPOCH
6581                    .checked_add(Duration::from_secs(100_000))
6582                    .unwrap(),
6583                interval: Duration::from_secs(5),
6584                lookback_delta: Duration::from_secs(1),
6585            },
6586            &build_query_engine_state(),
6587        )
6588        .await
6589        .unwrap();
6590        assert_eq!(
6591            plan.display_indent_schema().to_string(),
6592            "PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6593            \n  PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6594            \n    Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6595            \n      Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-999, None) AND metrics.timestamp <= TimestampMillisecond(100000000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6596            \n        Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6597            \n          TableScan: metrics [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]"
6598        );
6599        let plan = PromPlanner::stmt_to_plan(
6600            DfTableSourceProvider::new(
6601                catalog_list.clone(),
6602                false,
6603                QueryContext::arc(),
6604                DummyDecoder::arc(),
6605                true,
6606            ),
6607            &EvalStmt {
6608                expr: parser::parse("avg_over_time(metrics{tag = \"1\"}[5s])").unwrap(),
6609                start: UNIX_EPOCH,
6610                end: UNIX_EPOCH
6611                    .checked_add(Duration::from_secs(100_000))
6612                    .unwrap(),
6613                interval: Duration::from_secs(5),
6614                lookback_delta: Duration::from_secs(1),
6615            },
6616            &build_query_engine_state(),
6617        )
6618        .await
6619        .unwrap();
6620        assert_eq!(
6621            plan.display_indent_schema().to_string(),
6622            "Filter: prom_avg_over_time(timestamp_range,field) IS NOT NULL [timestamp:Timestamp(ms), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
6623            \n  Projection: metrics.timestamp, prom_avg_over_time(timestamp_range, field) AS prom_avg_over_time(timestamp_range,field), metrics.tag [timestamp:Timestamp(ms), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
6624            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[5000], time index=[timestamp], values=[\"field\"] [field:Dictionary(Int64, Float64);N, tag:Utf8, timestamp:Timestamp(ms), timestamp_range:Dictionary(Int64, Timestamp(ms))]\
6625            \n      PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6626            \n        PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6627            \n          Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6628            \n            Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-4999, None) AND metrics.timestamp <= TimestampMillisecond(100000000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6629            \n              Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
6630            \n                TableScan: metrics [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]"
6631        );
6632    }
6633
6634    #[tokio::test]
6635    async fn test_nonexistent_label() {
6636        // template
6637        let mut eval_stmt = EvalStmt {
6638            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6639            start: UNIX_EPOCH,
6640            end: UNIX_EPOCH
6641                .checked_add(Duration::from_secs(100_000))
6642                .unwrap(),
6643            interval: Duration::from_secs(5),
6644            lookback_delta: Duration::from_secs(1),
6645        };
6646
6647        let case = r#"some_metric{nonexistent="hi"}"#;
6648        let prom_expr = parser::parse(case).unwrap();
6649        eval_stmt.expr = prom_expr;
6650        let table_provider = build_test_table_provider(
6651            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6652            3,
6653            3,
6654        )
6655        .await;
6656        // Should be ok
6657        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6658            .await
6659            .unwrap();
6660    }
6661
6662    #[tokio::test]
6663    async fn test_label_join() {
6664        let prom_expr = parser::parse(
6665            "label_join(up{tag_0='api-server'}, 'foo', ',', 'tag_1', 'tag_2', 'tag_3')",
6666        )
6667        .unwrap();
6668        let eval_stmt = EvalStmt {
6669            expr: prom_expr,
6670            start: UNIX_EPOCH,
6671            end: UNIX_EPOCH
6672                .checked_add(Duration::from_secs(100_000))
6673                .unwrap(),
6674            interval: Duration::from_secs(5),
6675            lookback_delta: Duration::from_secs(1),
6676        };
6677
6678        let table_provider =
6679            build_test_table_provider(&[(DEFAULT_SCHEMA_NAME.to_string(), "up".to_string())], 4, 1)
6680                .await;
6681        let plan =
6682            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6683                .await
6684                .unwrap();
6685
6686        let expected = r#"
6687Filter: up.field_0 IS NOT NULL [timestamp:Timestamp(ms), field_0:Float64;N, foo:Utf8;N, tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8]
6688  Projection: up.timestamp, up.field_0, concat_ws(Utf8(","), up.tag_1, up.tag_2, up.tag_3) AS foo, up.tag_0, up.tag_1, up.tag_2, up.tag_3 [timestamp:Timestamp(ms), field_0:Float64;N, foo:Utf8;N, tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8]
6689    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
6690      PromSeriesDivide: tags=["tag_0", "tag_1", "tag_2", "tag_3"] [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
6691        Sort: up.tag_0 ASC NULLS FIRST, up.tag_1 ASC NULLS FIRST, up.tag_2 ASC NULLS FIRST, up.tag_3 ASC NULLS FIRST, up.timestamp ASC NULLS FIRST [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
6692          Filter: up.tag_0 = Utf8("api-server") AND up.timestamp >= TimestampMillisecond(-999, None) AND up.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
6693            TableScan: up [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]"#;
6694
6695        let ret = plan.display_indent_schema().to_string();
6696        assert_eq!(format!("\n{ret}"), expected, "\n{}", ret);
6697    }
6698
6699    #[tokio::test]
6700    async fn test_label_replace() {
6701        let prom_expr = parser::parse(
6702            "label_replace(up{tag_0=\"a:c\"}, \"foo\", \"$1\", \"tag_0\", \"(.*):.*\")",
6703        )
6704        .unwrap();
6705        let eval_stmt = EvalStmt {
6706            expr: prom_expr,
6707            start: UNIX_EPOCH,
6708            end: UNIX_EPOCH
6709                .checked_add(Duration::from_secs(100_000))
6710                .unwrap(),
6711            interval: Duration::from_secs(5),
6712            lookback_delta: Duration::from_secs(1),
6713        };
6714
6715        let table_provider =
6716            build_test_table_provider(&[(DEFAULT_SCHEMA_NAME.to_string(), "up".to_string())], 1, 1)
6717                .await;
6718        let plan =
6719            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6720                .await
6721                .unwrap();
6722
6723        let expected = r#"
6724Filter: up.field_0 IS NOT NULL [timestamp:Timestamp(ms), field_0:Float64;N, foo:Utf8;N, tag_0:Utf8]
6725  Projection: up.timestamp, up.field_0, regexp_replace(up.tag_0, Utf8("^(?s:(.*):.*)$"), Utf8("$1")) AS foo, up.tag_0 [timestamp:Timestamp(ms), field_0:Float64;N, foo:Utf8;N, tag_0:Utf8]
6726    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
6727      PromSeriesDivide: tags=["tag_0"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
6728        Sort: up.tag_0 ASC NULLS FIRST, up.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
6729          Filter: up.tag_0 = Utf8("a:c") AND up.timestamp >= TimestampMillisecond(-999, None) AND up.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
6730            TableScan: up [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]"#;
6731
6732        let ret = plan.display_indent_schema().to_string();
6733        assert_eq!(format!("\n{ret}"), expected, "\n{}", ret);
6734    }
6735
6736    #[tokio::test]
6737    async fn test_matchers_to_expr() {
6738        let mut eval_stmt = EvalStmt {
6739            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6740            start: UNIX_EPOCH,
6741            end: UNIX_EPOCH
6742                .checked_add(Duration::from_secs(100_000))
6743                .unwrap(),
6744            interval: Duration::from_secs(5),
6745            lookback_delta: Duration::from_secs(1),
6746        };
6747        let case =
6748            r#"sum(prometheus_tsdb_head_series{tag_1=~"(10.0.160.237:8080|10.0.160.237:9090)"})"#;
6749
6750        let prom_expr = parser::parse(case).unwrap();
6751        eval_stmt.expr = prom_expr;
6752        let table_provider = build_test_table_provider(
6753            &[(
6754                DEFAULT_SCHEMA_NAME.to_string(),
6755                "prometheus_tsdb_head_series".to_string(),
6756            )],
6757            3,
6758            3,
6759        )
6760        .await;
6761        let plan =
6762            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6763                .await
6764                .unwrap();
6765        let expected = "Sort: prometheus_tsdb_head_series.timestamp ASC NULLS LAST [timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.field_0):Float64;N, sum(prometheus_tsdb_head_series.field_1):Float64;N, sum(prometheus_tsdb_head_series.field_2):Float64;N]\
6766        \n  Aggregate: groupBy=[[prometheus_tsdb_head_series.timestamp]], aggr=[[sum(prometheus_tsdb_head_series.field_0), sum(prometheus_tsdb_head_series.field_1), sum(prometheus_tsdb_head_series.field_2)]] [timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.field_0):Float64;N, sum(prometheus_tsdb_head_series.field_1):Float64;N, sum(prometheus_tsdb_head_series.field_2):Float64;N]\
6767        \n    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]\
6768        \n      PromSeriesDivide: tags=[\"tag_0\", \"tag_1\", \"tag_2\"] [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]\
6769        \n        Sort: prometheus_tsdb_head_series.tag_0 ASC NULLS FIRST, prometheus_tsdb_head_series.tag_1 ASC NULLS FIRST, prometheus_tsdb_head_series.tag_2 ASC NULLS FIRST, prometheus_tsdb_head_series.timestamp ASC NULLS FIRST [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]\
6770        \n          Filter: prometheus_tsdb_head_series.tag_1 ~ Utf8(\"^(?:(10.0.160.237:8080|10.0.160.237:9090))$\") AND prometheus_tsdb_head_series.timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]\
6771        \n            TableScan: prometheus_tsdb_head_series [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]";
6772        assert_eq!(plan.display_indent_schema().to_string(), expected);
6773    }
6774
6775    #[tokio::test]
6776    async fn test_topk_expr() {
6777        let mut eval_stmt = EvalStmt {
6778            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6779            start: UNIX_EPOCH,
6780            end: UNIX_EPOCH
6781                .checked_add(Duration::from_secs(100_000))
6782                .unwrap(),
6783            interval: Duration::from_secs(5),
6784            lookback_delta: Duration::from_secs(1),
6785        };
6786        let case = r#"topk(10, sum(prometheus_tsdb_head_series{ip=~"(10.0.160.237:8080|10.0.160.237:9090)"}) by (ip))"#;
6787
6788        let prom_expr = parser::parse(case).unwrap();
6789        eval_stmt.expr = prom_expr;
6790        let table_provider = build_test_table_provider_with_fields(
6791            &[
6792                (
6793                    DEFAULT_SCHEMA_NAME.to_string(),
6794                    "prometheus_tsdb_head_series".to_string(),
6795                ),
6796                (
6797                    DEFAULT_SCHEMA_NAME.to_string(),
6798                    "http_server_requests_seconds_count".to_string(),
6799                ),
6800            ],
6801            &["ip"],
6802        )
6803        .await;
6804
6805        let plan =
6806            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6807                .await
6808                .unwrap();
6809        let expected = "Projection: sum(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp [sum(prometheus_tsdb_head_series.greptime_value):Float64;N, ip:Utf8, greptime_timestamp:Timestamp(ms)]\
6810        \n  Sort: prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N, row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64]\
6811        \n    Filter: row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Float64(10) [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N, row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64]\
6812        \n      WindowAggr: windowExpr=[[row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N, row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64]\
6813        \n        Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]\
6814        \n          Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[sum(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]\
6815        \n            PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6816        \n              PromSeriesDivide: tags=[\"ip\"] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6817        \n                Sort: prometheus_tsdb_head_series.ip ASC NULLS FIRST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS FIRST [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6818        \n                  Filter: prometheus_tsdb_head_series.ip ~ Utf8(\"^(?:(10.0.160.237:8080|10.0.160.237:9090))$\") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100000000, None) [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6819        \n                    TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]";
6820
6821        assert_eq!(plan.display_indent_schema().to_string(), expected);
6822    }
6823
6824    #[tokio::test]
6825    async fn test_count_values_expr() {
6826        let mut eval_stmt = EvalStmt {
6827            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6828            start: UNIX_EPOCH,
6829            end: UNIX_EPOCH
6830                .checked_add(Duration::from_secs(100_000))
6831                .unwrap(),
6832            interval: Duration::from_secs(5),
6833            lookback_delta: Duration::from_secs(1),
6834        };
6835        let case = r#"count_values('series', prometheus_tsdb_head_series{ip=~"(10.0.160.237:8080|10.0.160.237:9090)"}) by (ip)"#;
6836
6837        let prom_expr = parser::parse(case).unwrap();
6838        eval_stmt.expr = prom_expr;
6839        let table_provider = build_test_table_provider_with_fields(
6840            &[
6841                (
6842                    DEFAULT_SCHEMA_NAME.to_string(),
6843                    "prometheus_tsdb_head_series".to_string(),
6844                ),
6845                (
6846                    DEFAULT_SCHEMA_NAME.to_string(),
6847                    "http_server_requests_seconds_count".to_string(),
6848                ),
6849            ],
6850            &["ip"],
6851        )
6852        .await;
6853
6854        let plan =
6855            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6856                .await
6857                .unwrap();
6858        let expected = "Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, series [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N]\
6859        \n  Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, prometheus_tsdb_head_series.greptime_value ASC NULLS LAST [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N, greptime_value:Float64;N]\
6860        \n    Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value AS series, prometheus_tsdb_head_series.greptime_value [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N, greptime_value:Float64;N]\
6861        \n      Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value]], aggr=[[count(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N, count(prometheus_tsdb_head_series.greptime_value):Int64]\
6862        \n        PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6863        \n          PromSeriesDivide: tags=[\"ip\"] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6864        \n            Sort: prometheus_tsdb_head_series.ip ASC NULLS FIRST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS FIRST [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6865        \n              Filter: prometheus_tsdb_head_series.ip ~ Utf8(\"^(?:(10.0.160.237:8080|10.0.160.237:9090))$\") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100000000, None) [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6866        \n                TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]";
6867
6868        assert_eq!(plan.display_indent_schema().to_string(), expected);
6869    }
6870
6871    #[tokio::test]
6872    async fn test_value_alias() {
6873        let mut eval_stmt = EvalStmt {
6874            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6875            start: UNIX_EPOCH,
6876            end: UNIX_EPOCH
6877                .checked_add(Duration::from_secs(100_000))
6878                .unwrap(),
6879            interval: Duration::from_secs(5),
6880            lookback_delta: Duration::from_secs(1),
6881        };
6882        let case = r#"count_values('series', prometheus_tsdb_head_series{ip=~"(10.0.160.237:8080|10.0.160.237:9090)"}) by (ip)"#;
6883
6884        let prom_expr = parser::parse(case).unwrap();
6885        eval_stmt.expr = prom_expr;
6886        eval_stmt = QueryLanguageParser::apply_alias_extension(eval_stmt, "my_series");
6887        let table_provider = build_test_table_provider_with_fields(
6888            &[
6889                (
6890                    DEFAULT_SCHEMA_NAME.to_string(),
6891                    "prometheus_tsdb_head_series".to_string(),
6892                ),
6893                (
6894                    DEFAULT_SCHEMA_NAME.to_string(),
6895                    "http_server_requests_seconds_count".to_string(),
6896                ),
6897            ],
6898            &["ip"],
6899        )
6900        .await;
6901
6902        let plan =
6903            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6904                .await
6905                .unwrap();
6906        let expected = r#"
6907Projection: count(prometheus_tsdb_head_series.greptime_value) AS my_series, prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp [my_series:Int64, ip:Utf8, greptime_timestamp:Timestamp(ms)]
6908  Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, series [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N]
6909    Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, prometheus_tsdb_head_series.greptime_value ASC NULLS LAST [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N, greptime_value:Float64;N]
6910      Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value AS series, prometheus_tsdb_head_series.greptime_value [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N, greptime_value:Float64;N]
6911        Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value]], aggr=[[count(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N, count(prometheus_tsdb_head_series.greptime_value):Int64]
6912          PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
6913            PromSeriesDivide: tags=["ip"] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
6914              Sort: prometheus_tsdb_head_series.ip ASC NULLS FIRST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS FIRST [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
6915                Filter: prometheus_tsdb_head_series.ip ~ Utf8("^(?:(10.0.160.237:8080|10.0.160.237:9090))$") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100000000, None) [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
6916                  TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]"#;
6917        assert_eq!(format!("\n{}", plan.display_indent_schema()), expected);
6918    }
6919
6920    #[tokio::test]
6921    async fn test_quantile_expr() {
6922        let mut eval_stmt = EvalStmt {
6923            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6924            start: UNIX_EPOCH,
6925            end: UNIX_EPOCH
6926                .checked_add(Duration::from_secs(100_000))
6927                .unwrap(),
6928            interval: Duration::from_secs(5),
6929            lookback_delta: Duration::from_secs(1),
6930        };
6931        let case = r#"quantile(0.3, sum(prometheus_tsdb_head_series{ip=~"(10.0.160.237:8080|10.0.160.237:9090)"}) by (ip))"#;
6932
6933        let prom_expr = parser::parse(case).unwrap();
6934        eval_stmt.expr = prom_expr;
6935        let table_provider = build_test_table_provider_with_fields(
6936            &[
6937                (
6938                    DEFAULT_SCHEMA_NAME.to_string(),
6939                    "prometheus_tsdb_head_series".to_string(),
6940                ),
6941                (
6942                    DEFAULT_SCHEMA_NAME.to_string(),
6943                    "http_server_requests_seconds_count".to_string(),
6944                ),
6945            ],
6946            &["ip"],
6947        )
6948        .await;
6949
6950        let plan =
6951            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6952                .await
6953                .unwrap();
6954        let expected = "Sort: prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [greptime_timestamp:Timestamp(ms), quantile(Float64(0.3),sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]\
6955        \n  Aggregate: groupBy=[[prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[quantile(Float64(0.3), sum(prometheus_tsdb_head_series.greptime_value))]] [greptime_timestamp:Timestamp(ms), quantile(Float64(0.3),sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]\
6956        \n    Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]\
6957        \n      Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[sum(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]\
6958        \n        PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6959        \n          PromSeriesDivide: tags=[\"ip\"] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6960        \n            Sort: prometheus_tsdb_head_series.ip ASC NULLS FIRST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS FIRST [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6961        \n              Filter: prometheus_tsdb_head_series.ip ~ Utf8(\"^(?:(10.0.160.237:8080|10.0.160.237:9090))$\") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100000000, None) [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
6962        \n                TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]";
6963
6964        assert_eq!(plan.display_indent_schema().to_string(), expected);
6965    }
6966
6967    #[tokio::test]
6968    async fn test_or_not_exists_table_label() {
6969        let mut eval_stmt = EvalStmt {
6970            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
6971            start: UNIX_EPOCH,
6972            end: UNIX_EPOCH
6973                .checked_add(Duration::from_secs(100_000))
6974                .unwrap(),
6975            interval: Duration::from_secs(5),
6976            lookback_delta: Duration::from_secs(1),
6977        };
6978        let case = r#"sum by (job, tag0, tag2) (metric_exists) or sum by (job, tag0, tag2) (metric_not_exists)"#;
6979
6980        let prom_expr = parser::parse(case).unwrap();
6981        eval_stmt.expr = prom_expr;
6982        let table_provider = build_test_table_provider_with_fields(
6983            &[(DEFAULT_SCHEMA_NAME.to_string(), "metric_exists".to_string())],
6984            &["job"],
6985        )
6986        .await;
6987
6988        let plan =
6989            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6990                .await
6991                .unwrap();
6992        let expected = r#"UnionDistinctOn: on col=[["job"]], ts_col=[greptime_timestamp] [greptime_timestamp:Timestamp(ms), job:Utf8, sum(metric_exists.greptime_value):Float64;N]
6993  SubqueryAlias: metric_exists [greptime_timestamp:Timestamp(ms), job:Utf8, sum(metric_exists.greptime_value):Float64;N]
6994    Projection: metric_exists.greptime_timestamp, metric_exists.job, sum(metric_exists.greptime_value) [greptime_timestamp:Timestamp(ms), job:Utf8, sum(metric_exists.greptime_value):Float64;N]
6995      Sort: metric_exists.job ASC NULLS LAST, metric_exists.greptime_timestamp ASC NULLS LAST [job:Utf8, greptime_timestamp:Timestamp(ms), sum(metric_exists.greptime_value):Float64;N]
6996        Aggregate: groupBy=[[metric_exists.job, metric_exists.greptime_timestamp]], aggr=[[sum(metric_exists.greptime_value)]] [job:Utf8, greptime_timestamp:Timestamp(ms), sum(metric_exists.greptime_value):Float64;N]
6997          PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
6998            PromSeriesDivide: tags=["job"] [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
6999              Sort: metric_exists.job ASC NULLS FIRST, metric_exists.greptime_timestamp ASC NULLS FIRST [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
7000                Filter: metric_exists.greptime_timestamp >= TimestampMillisecond(-999, None) AND metric_exists.greptime_timestamp <= TimestampMillisecond(100000000, None) [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
7001                  TableScan: metric_exists [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
7002  SubqueryAlias:  [greptime_timestamp:Timestamp(ms), job:Utf8;N, sum(.value):Float64;N]
7003    Projection: .time AS greptime_timestamp, Utf8(NULL) AS job, sum(.value) [greptime_timestamp:Timestamp(ms), job:Utf8;N, sum(.value):Float64;N]
7004      Sort: .time ASC NULLS LAST [time:Timestamp(ms), sum(.value):Float64;N]
7005        Aggregate: groupBy=[[.time]], aggr=[[sum(.value)]] [time:Timestamp(ms), sum(.value):Float64;N]
7006          EmptyMetric: range=[0..-1], interval=[5000] [time:Timestamp(ms), value:Float64;N]
7007            TableScan: dummy [time:Timestamp(ms), value:Float64;N]"#;
7008
7009        assert_eq!(plan.display_indent_schema().to_string(), expected);
7010    }
7011
7012    #[tokio::test]
7013    async fn test_histogram_quantile_missing_le_column() {
7014        let mut eval_stmt = EvalStmt {
7015            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7016            start: UNIX_EPOCH,
7017            end: UNIX_EPOCH
7018                .checked_add(Duration::from_secs(100_000))
7019                .unwrap(),
7020            interval: Duration::from_secs(5),
7021            lookback_delta: Duration::from_secs(1),
7022        };
7023
7024        // Test case: histogram_quantile with a table that doesn't have 'le' column
7025        let case = r#"histogram_quantile(0.99, sum by(pod,instance,le) (rate(non_existent_histogram_bucket{instance=~"xxx"}[1m])))"#;
7026
7027        let prom_expr = parser::parse(case).unwrap();
7028        eval_stmt.expr = prom_expr;
7029
7030        // Create a table provider with a table that doesn't have 'le' column
7031        let table_provider = build_test_table_provider_with_fields(
7032            &[(
7033                DEFAULT_SCHEMA_NAME.to_string(),
7034                "non_existent_histogram_bucket".to_string(),
7035            )],
7036            &["pod", "instance"], // Note: no 'le' column
7037        )
7038        .await;
7039
7040        // Should return empty result instead of error
7041        let result =
7042            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7043                .await;
7044
7045        // This should succeed now (returning empty result) instead of failing with "Cannot find column le"
7046        assert!(
7047            result.is_ok(),
7048            "Expected successful plan creation with empty result, but got error: {:?}",
7049            result.err()
7050        );
7051
7052        // Verify that the result is an EmptyRelation
7053        let plan = result.unwrap();
7054        match plan {
7055            LogicalPlan::EmptyRelation(_) => {
7056                // This is what we expect
7057            }
7058            _ => panic!("Expected EmptyRelation, but got: {:?}", plan),
7059        }
7060    }
7061}