Skip to main content

flow/batching_mode/
task.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::collections::{BTreeSet, HashMap, HashSet};
16use std::sync::{Arc, RwLock};
17use std::time::{Duration, SystemTime, UNIX_EPOCH};
18
19use api::v1::CreateTableExpr;
20use catalog::CatalogManagerRef;
21use common_error::ext::BoxedError;
22use common_query::logical_plan::breakup_insert_plan;
23use common_telemetry::tracing::warn;
24use common_telemetry::{debug, info};
25use common_time::Timestamp;
26use datafusion::datasource::DefaultTableSource;
27use datafusion::sql::unparser::expr_to_sql;
28use datafusion_common::DFSchemaRef;
29use datafusion_common::tree_node::{Transformed, TreeNode};
30use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp};
31use datatypes::schema::Schema;
32use query::QueryEngineRef;
33use query::query_engine::DefaultSerializer;
34use session::context::QueryContextRef;
35use snafu::{OptionExt, ResultExt};
36use sql::parsers::utils::is_tql;
37use store_api::mito_engine_options::MERGE_MODE_KEY;
38use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
39use table::table::adapter::DfTableProviderAdapter;
40use tokio::sync::oneshot;
41use tokio::sync::oneshot::error::TryRecvError;
42use tokio::time::Instant;
43
44use crate::batching_mode::BatchingModeOptions;
45use crate::batching_mode::frontend_client::FrontendClient;
46use crate::batching_mode::state::{DirtyTimeWindows, FilterExprInfo, TaskState};
47use crate::batching_mode::table_creator::{QueryType, create_table_with_expr};
48use crate::batching_mode::time_window::TimeWindowExpr;
49use crate::batching_mode::utils::{
50    AddFilterRewriter, ColumnMatcherRewriter, gen_plan_with_matching_schema,
51    get_table_info_df_schema, sql_to_df_plan,
52};
53use crate::df_optimizer::apply_df_optimizer;
54use crate::error::{
55    DatafusionSnafu, ExternalSnafu, InvalidQuerySnafu, SubstraitEncodeLogicalPlanSnafu,
56    UnexpectedSnafu,
57};
58use crate::metrics::{
59    METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_TIME,
60    METRIC_FLOW_BATCHING_ENGINE_SLOW_QUERY, METRIC_FLOW_BATCHING_ENGINE_START_QUERY_CNT,
61    METRIC_FLOW_ROWS,
62};
63use crate::{Error, FlowId};
64
65/// The task's config, immutable once created
66#[derive(Clone)]
67pub struct TaskConfig {
68    pub flow_id: FlowId,
69    pub query: String,
70    /// output schema of the query
71    pub output_schema: DFSchemaRef,
72    pub time_window_expr: Option<TimeWindowExpr>,
73    /// in seconds
74    pub expire_after: Option<i64>,
75    pub sink_table_name: [String; 3],
76    pub source_table_names: HashSet<[String; 3]>,
77    pub catalog_manager: CatalogManagerRef,
78    pub query_type: QueryType,
79    pub batch_opts: Arc<BatchingModeOptions>,
80    pub flow_eval_interval: Option<Duration>,
81}
82
83fn determine_query_type(query: &str, query_ctx: &QueryContextRef) -> Result<QueryType, Error> {
84    let is_tql = is_tql(query_ctx.sql_dialect(), query)
85        .map_err(BoxedError::new)
86        .context(ExternalSnafu)?;
87    Ok(if is_tql {
88        QueryType::Tql
89    } else {
90        QueryType::Sql
91    })
92}
93
94fn is_merge_mode_last_non_null(options: &HashMap<String, String>) -> bool {
95    options
96        .get(MERGE_MODE_KEY)
97        .map(|mode| mode.eq_ignore_ascii_case("last_non_null"))
98        .unwrap_or(false)
99}
100
101#[derive(Clone)]
102pub struct BatchingTask {
103    pub config: Arc<TaskConfig>,
104    pub state: Arc<RwLock<TaskState>>,
105}
106
107/// Arguments for creating batching task
108pub struct TaskArgs<'a> {
109    pub flow_id: FlowId,
110    pub query: &'a str,
111    pub plan: LogicalPlan,
112    pub time_window_expr: Option<TimeWindowExpr>,
113    pub expire_after: Option<i64>,
114    pub sink_table_name: [String; 3],
115    pub source_table_names: Vec<[String; 3]>,
116    pub query_ctx: QueryContextRef,
117    pub catalog_manager: CatalogManagerRef,
118    pub shutdown_rx: oneshot::Receiver<()>,
119    pub batch_opts: Arc<BatchingModeOptions>,
120    pub flow_eval_interval: Option<Duration>,
121}
122
123pub struct PlanInfo {
124    pub plan: LogicalPlan,
125    pub dirty_restore: DirtyRestore,
126}
127
128pub enum DirtyRestore {
129    /// The query was scoped to dirty time ranges; restore those ranges if the
130    /// run fails.
131    Scoped(FilterExprInfo),
132    /// The query could not be scoped to dirty time ranges, so the dirty-window
133    /// state is only a dirty signal. Restore the consumed signal if the full
134    /// run fails.
135    ///
136    /// TODO(discord9): Full-query runs only need a dirty bool flag. Refactor
137    /// the unscoped path to stop reusing `DirtyTimeWindows` for this signal.
138    Unscoped(DirtyTimeWindows),
139}
140
141impl BatchingTask {
142    #[allow(clippy::too_many_arguments)]
143    pub fn try_new(
144        TaskArgs {
145            flow_id,
146            query,
147            plan,
148            time_window_expr,
149            expire_after,
150            sink_table_name,
151            source_table_names,
152            query_ctx,
153            catalog_manager,
154            shutdown_rx,
155            batch_opts,
156            flow_eval_interval,
157        }: TaskArgs<'_>,
158    ) -> Result<Self, Error> {
159        Ok(Self {
160            config: Arc::new(TaskConfig {
161                flow_id,
162                query: query.to_string(),
163                time_window_expr,
164                expire_after,
165                sink_table_name,
166                source_table_names: source_table_names.into_iter().collect(),
167                catalog_manager,
168                output_schema: plan.schema().clone(),
169                query_type: determine_query_type(query, &query_ctx)?,
170                batch_opts,
171                flow_eval_interval,
172            }),
173            state: Arc::new(RwLock::new(TaskState::new(query_ctx, shutdown_rx))),
174        })
175    }
176
177    pub fn last_execution_time_millis(&self) -> Option<i64> {
178        self.state.read().unwrap().last_execution_time_millis()
179    }
180
181    /// mark time window range (now - expire_after, now) as dirty (or (0, now) if expire_after not set)
182    ///
183    /// useful for flush_flow to flush dirty time windows range
184    pub fn mark_all_windows_as_dirty(&self) -> Result<(), Error> {
185        let now = SystemTime::now();
186        let now = Timestamp::new_second(
187            now.duration_since(UNIX_EPOCH)
188                .expect("Time went backwards")
189                .as_secs() as _,
190        );
191        let lower_bound = self
192            .config
193            .expire_after
194            .map(|e| now.sub_duration(Duration::from_secs(e as _)))
195            .transpose()
196            .map_err(BoxedError::new)
197            .context(ExternalSnafu)?
198            .unwrap_or(Timestamp::new_second(0));
199        debug!(
200            "Flow {} mark range ({:?}, {:?}) as dirty",
201            self.config.flow_id, lower_bound, now
202        );
203        self.state
204            .write()
205            .unwrap()
206            .dirty_time_windows
207            .add_window(lower_bound, Some(now));
208        Ok(())
209    }
210
211    /// Create sink table if not exists
212    pub async fn check_or_create_sink_table(
213        &self,
214        engine: &QueryEngineRef,
215        frontend_client: &Arc<FrontendClient>,
216    ) -> Result<Option<(usize, Duration)>, Error> {
217        if !self.is_table_exist(&self.config.sink_table_name).await? {
218            let create_table = self.gen_create_table_expr(engine.clone()).await?;
219            info!(
220                "Try creating sink table(if not exists) with expr: {:?}",
221                create_table
222            );
223            self.create_table(frontend_client, create_table).await?;
224            info!(
225                "Sink table {}(if not exists) created",
226                self.config.sink_table_name.join(".")
227            );
228        }
229
230        Ok(None)
231    }
232
233    async fn is_table_exist(&self, table_name: &[String; 3]) -> Result<bool, Error> {
234        self.config
235            .catalog_manager
236            .table_exists(&table_name[0], &table_name[1], &table_name[2], None)
237            .await
238            .map_err(BoxedError::new)
239            .context(ExternalSnafu)
240    }
241
242    pub async fn gen_exec_once(
243        &self,
244        engine: &QueryEngineRef,
245        frontend_client: &Arc<FrontendClient>,
246        max_window_cnt: Option<usize>,
247    ) -> Result<Option<(usize, Duration)>, Error> {
248        if let Some(new_query) = self.gen_insert_plan(engine, max_window_cnt).await? {
249            debug!("Generate new query: {}", new_query.plan);
250            match self
251                .execute_logical_plan(frontend_client, &new_query.plan)
252                .await
253            {
254                Ok(result) => Ok(result),
255                Err(err) => {
256                    self.handle_executed_query_failure(Some(&new_query));
257                    Err(err)
258                }
259            }
260        } else {
261            debug!("Generate no query");
262            Ok(None)
263        }
264    }
265
266    pub async fn gen_insert_plan(
267        &self,
268        engine: &QueryEngineRef,
269        max_window_cnt: Option<usize>,
270    ) -> Result<Option<PlanInfo>, Error> {
271        let (table, df_schema) = get_table_info_df_schema(
272            self.config.catalog_manager.clone(),
273            self.config.sink_table_name.clone(),
274        )
275        .await?;
276
277        let table_meta = &table.table_info().meta;
278        let merge_mode_last_non_null =
279            is_merge_mode_last_non_null(&table_meta.options.extra_options);
280        let primary_key_indices = table_meta.primary_key_indices.clone();
281
282        let new_query = self
283            .gen_query_with_time_window(
284                engine.clone(),
285                &table.table_info().meta.schema,
286                &primary_key_indices,
287                merge_mode_last_non_null,
288                max_window_cnt,
289            )
290            .await?;
291
292        let Some(new_query) = new_query else {
293            return Ok(None);
294        };
295
296        // first check if all columns in input query exists in sink table
297        // since insert into ref to names in record batch generate by given query
298        let table_columns = df_schema
299            .columns()
300            .into_iter()
301            .map(|c| c.name)
302            .collect::<BTreeSet<_>>();
303        for column in new_query.plan.schema().columns() {
304            if !table_columns.contains(column.name()) {
305                self.restore_dirty_windows_after_failure(&new_query);
306                return InvalidQuerySnafu {
307                    reason: format!(
308                        "Column {} not found in sink table with columns {:?}",
309                        column, table_columns
310                    ),
311                }
312                .fail();
313            }
314        }
315
316        let table_provider = Arc::new(DfTableProviderAdapter::new(table));
317        let table_source = Arc::new(DefaultTableSource::new(table_provider));
318
319        // update_at& time index placeholder (if exists) should have default value
320        let plan = LogicalPlan::Dml(DmlStatement::new(
321            datafusion_common::TableReference::Full {
322                catalog: self.config.sink_table_name[0].clone().into(),
323                schema: self.config.sink_table_name[1].clone().into(),
324                table: self.config.sink_table_name[2].clone().into(),
325            },
326            table_source,
327            WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
328            Arc::new(new_query.plan.clone()),
329        ));
330        let insert_into_info = PlanInfo {
331            plan,
332            dirty_restore: new_query.dirty_restore,
333        };
334        let insert_into =
335            match insert_into_info
336                .plan
337                .clone()
338                .recompute_schema()
339                .context(DatafusionSnafu {
340                    context: "Failed to recompute schema",
341                }) {
342                Ok(insert_into) => insert_into,
343                Err(err) => {
344                    self.restore_dirty_windows_after_failure(&insert_into_info);
345                    return Err(err);
346                }
347            };
348
349        Ok(Some(PlanInfo {
350            plan: insert_into,
351            dirty_restore: insert_into_info.dirty_restore,
352        }))
353    }
354
355    pub async fn create_table(
356        &self,
357        frontend_client: &Arc<FrontendClient>,
358        expr: CreateTableExpr,
359    ) -> Result<(), Error> {
360        let catalog = &self.config.sink_table_name[0];
361        let schema = &self.config.sink_table_name[1];
362        frontend_client
363            .create(expr.clone(), catalog, schema)
364            .await?;
365        Ok(())
366    }
367
368    pub async fn execute_logical_plan(
369        &self,
370        frontend_client: &Arc<FrontendClient>,
371        plan: &LogicalPlan,
372    ) -> Result<Option<(usize, Duration)>, Error> {
373        let instant = Instant::now();
374        let flow_id = self.config.flow_id;
375
376        debug!(
377            "Executing flow {flow_id}(expire_after={:?} secs) with query {}",
378            self.config.expire_after, &plan
379        );
380
381        let catalog = &self.config.sink_table_name[0];
382        let schema = &self.config.sink_table_name[1];
383
384        // fix all table ref by make it fully qualified, i.e. "table_name" => "catalog_name.schema_name.table_name"
385        let plan = plan
386            .clone()
387            .transform_down_with_subqueries(|p| {
388                if let LogicalPlan::TableScan(mut table_scan) = p {
389                    let resolved = table_scan.table_name.resolve(catalog, schema);
390                    table_scan.table_name = resolved.into();
391                    Ok(Transformed::yes(LogicalPlan::TableScan(table_scan)))
392                } else {
393                    Ok(Transformed::no(p))
394                }
395            })
396            .with_context(|_| DatafusionSnafu {
397                context: format!("Failed to fix table ref in logical plan, plan={:?}", plan),
398            })?
399            .data;
400
401        let mut peer_desc = None;
402
403        let res = {
404            let _timer = METRIC_FLOW_BATCHING_ENGINE_QUERY_TIME
405                .with_label_values(&[flow_id.to_string().as_str()])
406                .start_timer();
407
408            let req = if let Some((insert_to, insert_plan)) =
409                breakup_insert_plan(&plan, catalog, schema)
410            {
411                let message = DFLogicalSubstraitConvertor {}
412                    .encode(&insert_plan, DefaultSerializer)
413                    .context(SubstraitEncodeLogicalPlanSnafu)?;
414                api::v1::greptime_request::Request::Query(api::v1::QueryRequest {
415                    query: Some(api::v1::query_request::Query::InsertIntoPlan(
416                        api::v1::InsertIntoPlan {
417                            table_name: Some(insert_to),
418                            logical_plan: message.to_vec(),
419                        },
420                    )),
421                })
422            } else {
423                let message = DFLogicalSubstraitConvertor {}
424                    .encode(&plan, DefaultSerializer)
425                    .context(SubstraitEncodeLogicalPlanSnafu)?;
426
427                api::v1::greptime_request::Request::Query(api::v1::QueryRequest {
428                    query: Some(api::v1::query_request::Query::LogicalPlan(message.to_vec())),
429                })
430            };
431
432            frontend_client
433                .handle(req, catalog, schema, &mut peer_desc)
434                .await
435        };
436
437        let elapsed = instant.elapsed();
438        if let Ok(affected_rows) = &res {
439            debug!(
440                "Flow {flow_id} executed, affected_rows: {affected_rows:?}, elapsed: {:?}",
441                elapsed
442            );
443            METRIC_FLOW_ROWS
444                .with_label_values(&[format!("{}-out-batching", flow_id).as_str()])
445                .inc_by(*affected_rows as _);
446        } else if let Err(err) = &res {
447            warn!(
448                "Failed to execute Flow {flow_id} on frontend {:?}, result: {err:?}, elapsed: {:?} with query: {}",
449                peer_desc, elapsed, &plan
450            );
451        }
452
453        // record slow query
454        if elapsed >= self.config.batch_opts.slow_query_threshold {
455            warn!(
456                "Flow {flow_id} on frontend {:?} executed for {:?} before complete, query: {}",
457                peer_desc, elapsed, &plan
458            );
459            METRIC_FLOW_BATCHING_ENGINE_SLOW_QUERY
460                .with_label_values(&[
461                    flow_id.to_string().as_str(),
462                    &peer_desc.unwrap_or_default().to_string(),
463                ])
464                .observe(elapsed.as_secs_f64());
465        }
466
467        self.state
468            .write()
469            .unwrap()
470            .after_query_exec(elapsed, res.is_ok());
471
472        let res = res?;
473        Ok(Some((res as usize, elapsed)))
474    }
475
476    /// Restore dirty windows consumed by a failed query so they are retried on
477    /// the next execution.
478    ///
479    fn restore_dirty_windows_after_failure(&self, query: &PlanInfo) {
480        match &query.dirty_restore {
481            DirtyRestore::Scoped(filter) => self.restore_scoped_dirty_windows(filter),
482            DirtyRestore::Unscoped(dirty_windows) => self
483                .state
484                .write()
485                .unwrap()
486                .dirty_time_windows
487                .add_dirty_windows(dirty_windows),
488        }
489    }
490
491    fn restore_scoped_dirty_windows(&self, filter: &FilterExprInfo) {
492        self.state
493            .write()
494            .unwrap()
495            .dirty_time_windows
496            .add_windows(filter.time_ranges.clone());
497    }
498
499    fn restore_scoped_dirty_windows_on_err<T>(
500        &self,
501        filter: &FilterExprInfo,
502        result: Result<T, Error>,
503    ) -> Result<T, Error> {
504        result.inspect_err(|_| {
505            self.restore_scoped_dirty_windows(filter);
506        })
507    }
508
509    fn handle_executed_query_failure(&self, query: Option<&PlanInfo>) {
510        if let Some(query) = query {
511            self.restore_dirty_windows_after_failure(query);
512        }
513    }
514
515    /// start executing query in a loop, break when receive shutdown signal
516    ///
517    /// any error will be logged when executing query
518    pub async fn start_executing_loop(
519        &self,
520        engine: QueryEngineRef,
521        frontend_client: Arc<FrontendClient>,
522    ) {
523        let flow_id_str = self.config.flow_id.to_string();
524        let mut max_window_cnt = None;
525        let mut interval = self
526            .config
527            .flow_eval_interval
528            .map(|d| tokio::time::interval(d));
529        if let Some(tick) = &mut interval {
530            tick.tick().await; // pass the first tick immediately
531        }
532        loop {
533            // first check if shutdown signal is received
534            // if so, break the loop
535            {
536                let mut state = self.state.write().unwrap();
537                match state.shutdown_rx.try_recv() {
538                    Ok(()) => break,
539                    Err(TryRecvError::Closed) => {
540                        warn!(
541                            "Unexpected shutdown flow {}, shutdown anyway",
542                            self.config.flow_id
543                        );
544                        break;
545                    }
546                    Err(TryRecvError::Empty) => (),
547                }
548            }
549            METRIC_FLOW_BATCHING_ENGINE_START_QUERY_CNT
550                .with_label_values(&[&flow_id_str])
551                .inc();
552
553            let min_refresh = self.config.batch_opts.experimental_min_refresh_duration;
554
555            let new_query = match self.gen_insert_plan(&engine, max_window_cnt).await {
556                Ok(new_query) => new_query,
557                Err(err) => {
558                    common_telemetry::error!(err; "Failed to generate query for flow={}", self.config.flow_id);
559                    // also sleep for a little while before try again to prevent flooding logs
560                    tokio::time::sleep(min_refresh).await;
561                    continue;
562                }
563            };
564
565            let res = if let Some(new_query) = &new_query {
566                self.execute_logical_plan(&frontend_client, &new_query.plan)
567                    .await
568            } else {
569                Ok(None)
570            };
571
572            match res {
573                // normal execute, sleep for some time before doing next query
574                Ok(Some(_)) => {
575                    // can increase max_window_cnt to query more windows next time
576                    max_window_cnt = max_window_cnt.map(|cnt| {
577                        (cnt + 1).min(self.config.batch_opts.experimental_max_filter_num_per_query)
578                    });
579
580                    // here use proper ticking if set eval interval
581                    if let Some(eval_interval) = &mut interval {
582                        eval_interval.tick().await;
583                    } else {
584                        // if not explicitly set, just automatically calculate next start time
585                        // using time window size and more args
586                        let sleep_until = {
587                            let state = self.state.write().unwrap();
588
589                            let time_window_size = self
590                                .config
591                                .time_window_expr
592                                .as_ref()
593                                .and_then(|t| *t.time_window_size());
594
595                            state.get_next_start_query_time(
596                                self.config.flow_id,
597                                &time_window_size,
598                                min_refresh,
599                                Some(self.config.batch_opts.query_timeout),
600                                self.config.batch_opts.experimental_max_filter_num_per_query,
601                            )
602                        };
603
604                        tokio::time::sleep_until(sleep_until).await;
605                    };
606                }
607                // no new data, sleep for some time before checking for new data
608                Ok(None) => {
609                    debug!(
610                        "Flow id = {:?} found no new data, sleep for {:?} then continue",
611                        self.config.flow_id, min_refresh
612                    );
613                    tokio::time::sleep(min_refresh).await;
614                    continue;
615                }
616                // TODO(discord9): this error should have better place to go, but for now just print error, also more context is needed
617                Err(err) => {
618                    self.handle_executed_query_failure(new_query.as_ref());
619                    METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT
620                        .with_label_values(&[&flow_id_str])
621                        .inc();
622                    match new_query {
623                        Some(query) => {
624                            common_telemetry::error!(err; "Failed to execute query for flow={} with query: {}", self.config.flow_id, query.plan);
625                            // TODO(discord9): add some backoff here? half the query time window or what
626                            // backoff meaning use smaller `max_window_cnt` for next query
627
628                            // since last query failed, we should not try to query too many windows
629                            max_window_cnt = Some(1);
630                        }
631                        None => {
632                            common_telemetry::error!(err; "Failed to generate query for flow={}", self.config.flow_id)
633                        }
634                    }
635                    // also sleep for a little while before try again to prevent flooding logs
636                    tokio::time::sleep(min_refresh).await;
637                }
638            }
639        }
640    }
641
642    /// Generate the create table SQL
643    ///
644    /// the auto created table will automatically added a `update_at` Milliseconds DEFAULT now() column in the end
645    /// (for compatibility with flow streaming mode)
646    ///
647    /// and it will use first timestamp column as time index, all other columns will be added as normal columns and nullable
648    async fn gen_create_table_expr(
649        &self,
650        engine: QueryEngineRef,
651    ) -> Result<CreateTableExpr, Error> {
652        let query_ctx = self.state.read().unwrap().query_ctx.clone();
653        let plan =
654            sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, true).await?;
655        create_table_with_expr(&plan, &self.config.sink_table_name, &self.config.query_type)
656    }
657
658    /// will merge and use the first ten time window in query
659    async fn gen_query_with_time_window(
660        &self,
661        engine: QueryEngineRef,
662        sink_table_schema: &Arc<Schema>,
663        primary_key_indices: &[usize],
664        allow_partial: bool,
665        max_window_cnt: Option<usize>,
666    ) -> Result<Option<PlanInfo>, Error> {
667        let query_ctx = self.state.read().unwrap().query_ctx.clone();
668        let start = SystemTime::now();
669        let since_the_epoch = start
670            .duration_since(UNIX_EPOCH)
671            .expect("Time went backwards");
672        let low_bound = self
673            .config
674            .expire_after
675            .map(|e| since_the_epoch.as_secs() - e as u64)
676            .unwrap_or(u64::MIN);
677
678        let low_bound = Timestamp::new_second(low_bound as i64);
679
680        let expire_time_window_bound = self
681            .config
682            .time_window_expr
683            .as_ref()
684            .map(|expr| expr.eval(low_bound))
685            .transpose()?;
686
687        let (expire_lower_bound, expire_upper_bound) =
688            match (expire_time_window_bound, &self.config.query_type) {
689                (Some((Some(l), Some(u))), QueryType::Sql) => (l, u),
690                (None, QueryType::Sql) => {
691                    // if it's sql query and no time window lower/upper bound is found, just return the original query(with auto columns)
692                    // use sink_table_meta to add to query the `update_at` and `__ts_placeholder` column's value too for compatibility reason
693                    debug!(
694                        "Flow id = {:?}, no time window, using the same query",
695                        self.config.flow_id
696                    );
697                    // clean dirty time window too, this could be from create flow's check_execute
698                    let (is_dirty, dirty_windows_to_restore) = {
699                        let mut state = self.state.write().unwrap();
700                        let dirty_windows_to_restore = state.dirty_time_windows.clone();
701                        let is_dirty = !dirty_windows_to_restore.is_empty();
702                        state.dirty_time_windows.clean();
703                        (is_dirty, dirty_windows_to_restore)
704                    };
705
706                    if !is_dirty {
707                        // no dirty data, hence no need to update
708                        debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
709                        return Ok(None);
710                    }
711
712                    let plan = match gen_plan_with_matching_schema(
713                        &self.config.query,
714                        query_ctx,
715                        engine,
716                        sink_table_schema.clone(),
717                        primary_key_indices,
718                        allow_partial,
719                    )
720                    .await
721                    {
722                        Ok(plan) => plan,
723                        Err(err) => {
724                            self.state
725                                .write()
726                                .unwrap()
727                                .dirty_time_windows
728                                .add_dirty_windows(&dirty_windows_to_restore);
729                            return Err(err);
730                        }
731                    };
732
733                    return Ok(Some(PlanInfo {
734                        plan,
735                        dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
736                    }));
737                }
738                _ => {
739                    // Clean dirty windows for full-query/non-scoped paths,
740                    // such as TQL, that cannot use a time-window filter.
741                    let dirty_windows_to_restore = {
742                        let mut state = self.state.write().unwrap();
743                        let dirty_windows_to_restore = state.dirty_time_windows.clone();
744                        state.dirty_time_windows.clean();
745                        dirty_windows_to_restore
746                    };
747
748                    let plan = match gen_plan_with_matching_schema(
749                        &self.config.query,
750                        query_ctx,
751                        engine,
752                        sink_table_schema.clone(),
753                        primary_key_indices,
754                        allow_partial,
755                    )
756                    .await
757                    {
758                        Ok(plan) => plan,
759                        Err(err) => {
760                            self.state
761                                .write()
762                                .unwrap()
763                                .dirty_time_windows
764                                .add_dirty_windows(&dirty_windows_to_restore);
765                            return Err(err);
766                        }
767                    };
768
769                    return Ok(Some(PlanInfo {
770                        plan,
771                        dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
772                    }));
773                }
774            };
775
776        debug!(
777            "Flow id = {:?}, found time window: precise_lower_bound={:?}, precise_upper_bound={:?} with dirty time windows: {:?}",
778            self.config.flow_id,
779            expire_lower_bound,
780            expire_upper_bound,
781            self.state.read().unwrap().dirty_time_windows
782        );
783        let window_size = expire_upper_bound
784            .sub(&expire_lower_bound)
785            .with_context(|| UnexpectedSnafu {
786                reason: format!(
787                    "Can't get window size from {expire_upper_bound:?} - {expire_lower_bound:?}"
788                ),
789            })?;
790        let col_name = self
791            .config
792            .time_window_expr
793            .as_ref()
794            .map(|expr| expr.column_name.clone())
795            .with_context(|| UnexpectedSnafu {
796                reason: format!(
797                    "Flow id={:?}, Failed to get column name from time window expr",
798                    self.config.flow_id
799                ),
800            })?;
801
802        let expr = self
803            .state
804            .write()
805            .unwrap()
806            .dirty_time_windows
807            .gen_filter_exprs(
808                &col_name,
809                Some(expire_lower_bound),
810                window_size,
811                max_window_cnt
812                    .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query),
813                self.config.flow_id,
814                Some(self),
815            )?;
816
817        let Some(expr) = expr else {
818            // no new data, hence no need to update
819            debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
820            return Ok(None);
821        };
822
823        let filter_sql = expr_to_sql(&expr.expr)
824            .map(|sql| sql.to_string())
825            .unwrap_or_else(|err| format!("<failed to format filter expr: {err}>"));
826
827        debug!(
828            "Flow id={:?}, Generated filter expr: {:?}",
829            self.config.flow_id, filter_sql
830        );
831
832        let mut add_filter = AddFilterRewriter::new(expr.expr.clone());
833        let mut add_auto_column = ColumnMatcherRewriter::new(
834            sink_table_schema.clone(),
835            primary_key_indices.to_vec(),
836            allow_partial,
837        );
838
839        let plan = self.restore_scoped_dirty_windows_on_err(
840            &expr,
841            sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, false).await,
842        )?;
843        let rewrite = self.restore_scoped_dirty_windows_on_err(
844            &expr,
845            plan.clone()
846                .rewrite(&mut add_filter)
847                .and_then(|p| p.data.rewrite(&mut add_auto_column))
848                .with_context(|_| DatafusionSnafu {
849                    context: format!("Failed to rewrite plan:\n {}\n", plan),
850                })
851                .map(|rewrite| rewrite.data),
852        )?;
853        // only apply optimize after complex rewrite is done
854        let new_plan = self.restore_scoped_dirty_windows_on_err(
855            &expr,
856            apply_df_optimizer(rewrite, &query_ctx).await,
857        )?;
858
859        let info = PlanInfo {
860            plan: new_plan.clone(),
861            dirty_restore: DirtyRestore::Scoped(expr),
862        };
863
864        Ok(Some(info))
865    }
866}
867
868#[cfg(test)]
869mod test;