// flow/batching_mode/state.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Batching mode task state, which changes frequently.
//!

18use std::collections::{BTreeMap, BTreeSet, HashMap};
19use std::time::Duration;
20
21use common_telemetry::debug;
22use common_telemetry::tracing::warn;
23use common_time::Timestamp;
24use datatypes::value::Value;
25use session::context::QueryContextRef;
26use snafu::{OptionExt, ResultExt, ensure};
27use tokio::sync::oneshot;
28use tokio::time::Instant;
29
30use crate::batching_mode::task::BatchingTask;
31use crate::batching_mode::time_window::TimeWindowExpr;
32use crate::error::{DatatypesSnafu, InternalSnafu, TimeSnafu, UnexpectedSnafu};
33use crate::metrics::{
34    METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_SIZE,
35    METRIC_FLOW_BATCHING_ENGINE_STALLED_WINDOW_SIZE,
36};
37use crate::{Error, FlowId};
38
/// The state of the [`BatchingTask`].
///
/// Holds the bookkeeping a task updates around every query: timing of the
/// last run, the dirty time windows still to be processed, and the
/// checkpoint state used to choose between full-snapshot and incremental
/// execution.
#[derive(Debug)]
pub struct TaskState {
    /// Query context
    pub(crate) query_ctx: QueryContextRef,
    /// Instant at which the last query completed (successfully or not).
    last_update_time: Instant,
    /// How long the last query took.
    last_query_duration: Duration,
    /// Last successful execution time in unix timestamp milliseconds.
    last_exec_time_millis: Option<i64>,
    /// Dirty time windows that still need to be updated,
    /// a mapping of `start -> end`.
    pub(crate) dirty_time_windows: DirtyTimeWindows,
    /// Current checkpoint mode. Starts as `FullSnapshot` and is switched to
    /// `Incremental` when checkpoints are advanced while incremental mode is
    /// not disabled.
    checkpoint_mode: CheckpointMode,
    /// Region id -> last consumed watermark sequence. Incremental scans use
    /// this as the next lower sequence bound for each source region.
    checkpoints: BTreeMap<u64, u64>,
    /// Once set, the task will never attempt incremental mode again.
    /// Set when the flow's query shape is deterministically incompatible
    /// with incremental execution (e.g. unsupported aggregate expressions).
    incremental_disabled: bool,
    /// Execution state; reset to `Idle` whenever a query finishes.
    exec_state: ExecState,
    /// Shutdown receiver
    pub(crate) shutdown_rx: oneshot::Receiver<()>,
    /// Task handle
    pub(crate) task_handle: Option<tokio::task::JoinHandle<()>>,
}
67impl TaskState {
68    pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
69        Self {
70            query_ctx,
71            last_update_time: Instant::now(),
72            last_query_duration: Duration::from_secs(0),
73            last_exec_time_millis: None,
74            dirty_time_windows: Default::default(),
75            checkpoint_mode: CheckpointMode::FullSnapshot,
76            checkpoints: Default::default(),
77            incremental_disabled: false,
78            exec_state: ExecState::Idle,
79            shutdown_rx,
80            task_handle: None,
81        }
82    }
83
84    /// called after last query is done
85    /// `is_succ` indicate whether the last query is successful
86    pub fn after_query_exec(&mut self, elapsed: Duration, is_succ: bool) {
87        self.exec_state = ExecState::Idle;
88        self.last_query_duration = elapsed;
89        self.last_update_time = Instant::now();
90        if is_succ {
91            self.last_exec_time_millis = Some(common_time::util::current_time_millis());
92        }
93    }
94
95    pub fn last_execution_time_millis(&self) -> Option<i64> {
96        self.last_exec_time_millis
97    }
98
99    pub fn checkpoint_mode(&self) -> CheckpointMode {
100        self.checkpoint_mode
101    }
102
103    pub fn checkpoints(&self) -> &BTreeMap<u64, u64> {
104        &self.checkpoints
105    }
106
107    pub fn is_incremental_disabled(&self) -> bool {
108        self.incremental_disabled
109    }
110
111    /// Permanently disable incremental mode for this task and
112    /// immediately fall back to full snapshot for the current cycle.
113    pub fn disable_incremental(&mut self) {
114        self.incremental_disabled = true;
115        self.mark_full_snapshot();
116    }
117
118    pub fn mark_full_snapshot(&mut self) {
119        self.checkpoint_mode = CheckpointMode::FullSnapshot;
120    }
121
122    pub fn advance_checkpoints(&mut self, watermark_map: HashMap<u64, u64>) {
123        self.checkpoints = watermark_map.into_iter().collect();
124        if !self.incremental_disabled {
125            self.checkpoint_mode = CheckpointMode::Incremental;
126        }
127    }
128
129    pub fn advance_incremental_checkpoints_with_participation(
130        &mut self,
131        participating_regions: &BTreeSet<u64>,
132        watermark_map: HashMap<u64, u64>,
133    ) {
134        for region_id in participating_regions {
135            if let Some(seq) = watermark_map.get(region_id) {
136                self.checkpoints.insert(*region_id, *seq);
137            }
138        }
139        if !self.incremental_disabled {
140            self.checkpoint_mode = CheckpointMode::Incremental;
141        }
142    }
143
144    pub fn can_advance_full_snapshot_checkpoints(
145        &self,
146        participating_regions: &BTreeSet<u64>,
147        watermark_map: &HashMap<u64, u64>,
148    ) -> bool {
149        !participating_regions.is_empty()
150            && participating_regions.len() == watermark_map.len()
151            && participating_regions
152                .iter()
153                .all(|region_id| watermark_map.contains_key(region_id))
154    }
155
156    pub fn can_advance_incremental_checkpoints_with_participation(
157        &self,
158        participating_regions: &BTreeSet<u64>,
159        watermark_map: &HashMap<u64, u64>,
160    ) -> bool {
161        !self.incremental_disabled
162            && !self.checkpoints.is_empty()
163            && !participating_regions.is_empty()
164            && participating_regions.len() == watermark_map.len()
165            && participating_regions
166                .iter()
167                .all(|region_id| self.checkpoints.contains_key(region_id))
168            && participating_regions.iter().all(|region_id| {
169                let checkpoint = self.checkpoints.get(region_id);
170                watermark_map
171                    .get(region_id)
172                    .zip(checkpoint)
173                    .is_some_and(|(seq, checkpoint)| seq >= checkpoint)
174            })
175    }
176
177    /// Compute the next query delay based on the time window size or the last query duration.
178    /// Aiming to avoid too frequent queries. But also not too long delay.
179    ///
180    /// next wait time is calculated as:
181    /// last query duration, capped by [max(min_run_interval, time_window_size), max_timeout],
182    /// note at most wait for `max_timeout`.
183    ///
184    /// if current the dirty time range is longer than one query can handle,
185    /// execute immediately to faster clean up dirty time windows.
186    ///
187    /// If `prefer_short_incremental_cadence` is true, run incremental queries
188    /// more often when there is no large dirty backlog. This only reduces the
189    /// chance of hitting a stale cursor after flush; it is not required for
190    /// correctness.
191    pub fn get_next_start_query_time(
192        &self,
193        flow_id: FlowId,
194        time_window_size: &Option<Duration>,
195        min_refresh_duration: Duration,
196        max_timeout: Option<Duration>,
197        max_filter_num_per_query: usize,
198        prefer_short_incremental_cadence: bool,
199    ) -> Instant {
200        // = last query duration, capped by [max(min_run_interval, time_window_size), max_timeout], note at most `max_timeout`
201        let lower = time_window_size.unwrap_or(min_refresh_duration);
202        let next_duration = self.last_query_duration.max(lower);
203        let next_duration = if let Some(max_timeout) = max_timeout {
204            next_duration.min(max_timeout)
205        } else {
206            next_duration
207        };
208
209        let cur_dirty_window_size = self.dirty_time_windows.window_size();
210        // compute how much time range can be handled in one query
211        let max_query_update_range = (*time_window_size)
212            .unwrap_or_default()
213            .mul_f64(max_filter_num_per_query as f64);
214        // if dirty time range is more than one query can handle, execute immediately
215        // to faster clean up dirty time windows
216        if cur_dirty_window_size < max_query_update_range {
217            if prefer_short_incremental_cadence {
218                // Run incremental queries sooner than the normal time-window
219                // cadence, while still backing off by at least the previous
220                // query duration and respecting the max-timeout cap.
221                let next_duration = self.last_query_duration.max(min_refresh_duration);
222                let next_duration = if let Some(max_timeout) = max_timeout {
223                    next_duration.min(max_timeout)
224                } else {
225                    next_duration
226                };
227                self.last_update_time + next_duration
228            } else {
229                self.last_update_time + next_duration
230            }
231        } else {
232            // if dirty time windows can't be clean up in one query, execute immediately to faster
233            // clean up dirty time windows
234            debug!(
235                "Flow id = {}, still have too many {} dirty time window({:?}), execute immediately",
236                flow_id,
237                self.dirty_time_windows.windows.len(),
238                self.dirty_time_windows.windows
239            );
240            Instant::now()
241        }
242    }
243}
244
/// Keeps a record of dirty time windows, i.e. time windows that have had new
/// data inserted since the last query.
#[derive(Debug, Clone)]
pub struct DirtyTimeWindows {
    /// Windows as `start -> end`.
    /// `end` is exclusive and optional; a `None` end marks a window whose
    /// upper bound is not known yet.
    /// NOTE(review): entries are only guaranteed non-overlapping after
    /// `merge_dirty_time_windows` has run — confirm against callers.
    windows: BTreeMap<Timestamp, Option<Timestamp>>,
    /// Maximum number of filters allowed in a single query
    max_filter_num_per_query: usize,
    /// Time window merge distance: windows whose gap is at most
    /// `window_size * time_window_merge_threshold` are merged into one.
    time_window_merge_threshold: usize,
}
258
259impl DirtyTimeWindows {
260    pub fn new(max_filter_num_per_query: usize, time_window_merge_threshold: usize) -> Self {
261        Self {
262            windows: BTreeMap::new(),
263            max_filter_num_per_query,
264            time_window_merge_threshold,
265        }
266    }
267}
268
269impl Default for DirtyTimeWindows {
270    fn default() -> Self {
271        Self {
272            windows: BTreeMap::new(),
273            max_filter_num_per_query: 20,
274            time_window_merge_threshold: 3,
275        }
276    }
277}
278
279impl DirtyTimeWindows {
    /// Default time window merge distance.
    ///
    /// NOTE(review): the merge logic reads the per-instance
    /// `time_window_merge_threshold` field rather than this constant.
    ///
    /// TODO(discord9): make those configurable
    pub const MERGE_DIST: i32 = 3;
284
285    /// Add lower bounds to the dirty time windows. Upper bounds are ignored.
286    ///
287    /// # Arguments
288    ///
289    /// * `lower_bounds` - An iterator of lower bounds to be added.
290    pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
291        for lower_bound in lower_bounds {
292            let entry = self.windows.entry(lower_bound);
293            entry.or_insert(None);
294        }
295    }
296
297    pub fn window_size(&self) -> Duration {
298        let mut ret = Duration::from_secs(0);
299        for (start, end) in &self.windows {
300            if let Some(end) = end
301                && let Some(duration) = end.sub(start)
302            {
303                ret += duration.to_std().unwrap_or_default();
304            }
305        }
306        ret
307    }
308
    /// Add one dirty window. If a window with the same `start` already
    /// exists, the two ends are unioned instead of overwritten.
    pub fn add_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
        self.add_or_merge_window(start, end);
    }
312
313    pub fn add_windows(&mut self, time_ranges: Vec<(Timestamp, Timestamp)>) {
314        for (start, end) in time_ranges {
315            self.add_or_merge_window(start, Some(end));
316        }
317    }
318
319    /// Add all dirty markers from another dirty-window set.
320    pub fn add_dirty_windows(&mut self, dirty_windows: &DirtyTimeWindows) {
321        for (start, end) in &dirty_windows.windows {
322            self.add_or_merge_window(*start, *end);
323        }
324    }
325
326    fn add_or_merge_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
327        self.windows
328            .entry(start)
329            .and_modify(|current_end| {
330                *current_end = Self::union_window_end(*current_end, end);
331            })
332            .or_insert(end);
333    }
334
335    fn union_window_end(
336        current_end: Option<Timestamp>,
337        incoming_end: Option<Timestamp>,
338    ) -> Option<Timestamp> {
339        match (current_end, incoming_end) {
340            (Some(current), Some(incoming)) => Some(current.max(incoming)),
341            // `None` is a dirty marker without a known upper bound.  When one
342            // side has a concrete end, keep it so merging a restored snapshot
343            // never shrinks an already-known dirty range with the same start.
344            (Some(end), None) | (None, Some(end)) => Some(end),
345            (None, None) => None,
346        }
347    }
348
    /// Remove all dirty time windows; useful when no time window expression
    /// can be found.
    pub fn clean(&mut self) {
        self.windows.clear();
    }
353
    /// Mark the windows as dirty by adding an open-ended window starting at
    /// second 0. Only useful for a full aggregation without a time window,
    /// to record that some new data was inserted.
    pub fn set_dirty(&mut self) {
        self.add_or_merge_window(Timestamp::new_second(0), None);
    }
359
    /// Number of dirty windows currently tracked (one per distinct start).
    pub fn len(&self) -> usize {
        self.windows.len()
    }
364
    /// Returns `true` when no dirty window is tracked.
    pub fn is_empty(&self) -> bool {
        self.windows.is_empty()
    }
368
369    /// Get the effective count of time windows, which is the number of time windows that can be
370    /// used for query, compute from total time window range divided by `window_size`.
371    pub fn effective_count(&self, window_size: &Duration) -> usize {
372        if self.windows.is_empty() {
373            return 0;
374        }
375        let window_size =
376            chrono::Duration::from_std(*window_size).unwrap_or(chrono::Duration::zero());
377        let total_window_time_range =
378            self.windows
379                .iter()
380                .fold(chrono::Duration::zero(), |acc, (start, end)| {
381                    if let Some(end) = end {
382                        acc + end.sub(start).unwrap_or(chrono::Duration::zero())
383                    } else {
384                        acc + window_size
385                    }
386                });
387
388        // not sure window_size is zero have any meaning, but just in case
389        if window_size.num_seconds() == 0 {
390            0
391        } else {
392            (total_window_time_range.num_seconds() / window_size.num_seconds()) as usize
393        }
394    }
395
396    /// Generate all filter expressions consuming all time windows
397    ///
398    /// there is two limits:
399    /// - shouldn't return a too long time range(<=`window_size * window_cnt`), so that the query can be executed in a reasonable time
400    /// - shouldn't return too many time range exprs, so that the query can be parsed properly instead of causing parser to overflow
401    pub fn gen_filter_exprs(
402        &mut self,
403        col_name: &str,
404        expire_lower_bound: Option<Timestamp>,
405        window_size: chrono::Duration,
406        window_cnt: usize,
407        flow_id: FlowId,
408        task_ctx: Option<&BatchingTask>,
409    ) -> Result<Option<FilterExprInfo>, Error> {
410        ensure!(
411            window_size.num_seconds() > 0,
412            UnexpectedSnafu {
413                reason: "window_size is zero, can't generate filter exprs",
414            }
415        );
416
417        debug!(
418            "expire_lower_bound: {:?}, window_size: {:?}",
419            expire_lower_bound.map(|t| t.to_iso8601_string()),
420            window_size
421        );
422        self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
423
424        if self.windows.len() > window_cnt {
425            let first_time_window = self.windows.first_key_value();
426            let last_time_window = self.windows.last_key_value();
427
428            if let Some(task_ctx) = task_ctx {
429                warn!(
430                    "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
431                    task_ctx.config.flow_id,
432                    self.windows.len(),
433                    window_cnt,
434                    task_ctx.config.time_window_expr,
435                    task_ctx.config.expire_after,
436                    first_time_window,
437                    last_time_window,
438                    task_ctx.config.query
439                );
440            } else {
441                warn!(
442                    "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. first_time_window={:?}, last_time_window={:?}",
443                    flow_id,
444                    self.windows.len(),
445                    window_cnt,
446                    first_time_window,
447                    last_time_window
448                )
449            }
450        }
451
452        // get the first `window_cnt` time windows
453        let max_time_range = window_size * window_cnt as i32;
454
455        let mut to_be_query = BTreeMap::new();
456        let mut new_windows = self.windows.clone();
457        let mut cur_time_range = chrono::Duration::zero();
458        for (idx, (start, end)) in self.windows.iter().enumerate() {
459            let first_end = start
460                .add_duration(window_size.to_std().unwrap())
461                .context(TimeSnafu)?;
462            let end = end.unwrap_or(first_end);
463
464            // if time range is too long, stop
465            if cur_time_range >= max_time_range {
466                break;
467            }
468
469            // if we have enough time windows, stop
470            if idx >= window_cnt {
471                break;
472            }
473
474            let Some(x) = end.sub(start) else {
475                continue;
476            };
477            if cur_time_range + x <= max_time_range {
478                to_be_query.insert(*start, Some(end));
479                new_windows.remove(start);
480                cur_time_range += x;
481            } else {
482                // too large a window, split it
483                // split at window_size * times
484                let surplus = max_time_range - cur_time_range;
485                if surplus.num_seconds() <= window_size.num_seconds() {
486                    // Skip splitting if surplus is smaller than window_size
487                    break;
488                }
489                let times = surplus.num_seconds() / window_size.num_seconds();
490
491                let split_offset = window_size * times as i32;
492                let split_at = start
493                    .add_duration(split_offset.to_std().unwrap())
494                    .context(TimeSnafu)?;
495                to_be_query.insert(*start, Some(split_at));
496
497                // remove the original window
498                new_windows.remove(start);
499                new_windows.insert(split_at, Some(end));
500                cur_time_range += split_offset;
501                break;
502            }
503        }
504
505        self.windows = new_windows;
506
507        METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_CNT
508            .with_label_values(&[flow_id.to_string().as_str()])
509            .observe(to_be_query.len() as f64);
510
511        let full_time_range = to_be_query
512            .iter()
513            .fold(chrono::Duration::zero(), |acc, (start, end)| {
514                if let Some(end) = end {
515                    acc + end.sub(start).unwrap_or(chrono::Duration::zero())
516                } else {
517                    acc + window_size
518                }
519            })
520            .num_seconds() as f64;
521        METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_SIZE
522            .with_label_values(&[flow_id.to_string().as_str()])
523            .observe(full_time_range);
524
525        let stalled_time_range =
526            self.windows
527                .iter()
528                .fold(chrono::Duration::zero(), |acc, (start, end)| {
529                    if let Some(end) = end {
530                        acc + end.sub(start).unwrap_or(chrono::Duration::zero())
531                    } else {
532                        acc + window_size
533                    }
534                });
535
536        METRIC_FLOW_BATCHING_ENGINE_STALLED_WINDOW_SIZE
537            .with_label_values(&[flow_id.to_string().as_str()])
538            .observe(stalled_time_range.num_seconds() as f64);
539
540        let std_window_size = window_size.to_std().map_err(|e| {
541            InternalSnafu {
542                reason: e.to_string(),
543            }
544            .build()
545        })?;
546
547        let mut expr_lst = vec![];
548        let mut time_ranges = vec![];
549        for (start, end) in to_be_query.into_iter() {
550            // align using time window exprs
551            let (start, end) = if let Some(ctx) = task_ctx {
552                let Some(time_window_expr) = &ctx.config.time_window_expr else {
553                    UnexpectedSnafu {
554                        reason: "time_window_expr is not set",
555                    }
556                    .fail()?
557                };
558                self.align_time_window(start, end, time_window_expr)?
559            } else {
560                (start, end)
561            };
562            let end = end.unwrap_or(start.add_duration(std_window_size).context(TimeSnafu)?);
563            time_ranges.push((start, end));
564
565            debug!(
566                "Time window start: {:?}, end: {:?}",
567                start.to_iso8601_string(),
568                end.to_iso8601_string()
569            );
570
571            use datafusion_expr::{col, lit};
572            let lower = to_df_literal(start)?;
573            let upper = to_df_literal(end)?;
574            let expr = col(col_name)
575                .gt_eq(lit(lower))
576                .and(col(col_name).lt(lit(upper)));
577            expr_lst.push(expr);
578        }
579        let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
580        let ret = expr.map(|expr| FilterExprInfo {
581            expr,
582            col_name: col_name.to_string(),
583            time_ranges,
584            window_size,
585        });
586        Ok(ret)
587    }
588
589    fn align_time_window(
590        &self,
591        start: Timestamp,
592        end: Option<Timestamp>,
593        time_window_expr: &TimeWindowExpr,
594    ) -> Result<(Timestamp, Option<Timestamp>), Error> {
595        let align_start = time_window_expr.eval(start)?.0.context(UnexpectedSnafu {
596            reason: format!(
597                "Failed to align start time {:?} with time window expr {:?}",
598                start, time_window_expr
599            ),
600        })?;
601        let align_end = end
602            .and_then(|end| {
603                time_window_expr
604                    .eval(end)
605                    // if after aligned, end is the same, then use end(because it's already aligned) else use aligned end
606                    .map(|r| if r.0 == Some(end) { r.0 } else { r.1 })
607                    .transpose()
608            })
609            .transpose()?;
610        Ok((align_start, align_end))
611    }
612
613    /// Merge time windows that overlaps or get too close
614    ///
615    /// TODO(discord9): not merge and prefer to send smaller time windows? how?
616    pub fn merge_dirty_time_windows(
617        &mut self,
618        window_size: chrono::Duration,
619        expire_lower_bound: Option<Timestamp>,
620    ) -> Result<(), Error> {
621        if self.windows.is_empty() {
622            return Ok(());
623        }
624
625        let mut new_windows = BTreeMap::new();
626
627        let std_window_size = window_size.to_std().map_err(|e| {
628            InternalSnafu {
629                reason: e.to_string(),
630            }
631            .build()
632        })?;
633
634        // previous time window
635        let mut prev_tw = None;
636        for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
637            // filter out expired time window
638            if let Some(expire_lower_bound) = expire_lower_bound
639                && lower_bound < expire_lower_bound
640            {
641                continue;
642            }
643
644            let Some(prev_tw) = &mut prev_tw else {
645                prev_tw = Some((lower_bound, upper_bound));
646                continue;
647            };
648
649            // if cur.lower - prev.upper <= window_size * MERGE_DIST, merge
650            // this also deal with overlap windows because cur.lower > prev.lower is always true
651            let prev_upper = prev_tw
652                .1
653                .unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
654            prev_tw.1 = Some(prev_upper);
655
656            let cur_upper = upper_bound.unwrap_or(
657                lower_bound
658                    .add_duration(std_window_size)
659                    .context(TimeSnafu)?,
660            );
661
662            if lower_bound
663                .sub(&prev_upper)
664                .map(|dist| dist <= window_size * self.time_window_merge_threshold as i32)
665                .unwrap_or(false)
666            {
667                prev_tw.1 = Some(cur_upper);
668            } else {
669                new_windows.insert(prev_tw.0, prev_tw.1);
670                *prev_tw = (lower_bound, Some(cur_upper));
671            }
672        }
673
674        if let Some(prev_tw) = prev_tw {
675            new_windows.insert(prev_tw.0, prev_tw.1);
676        }
677
678        self.windows = new_windows;
679
680        Ok(())
681    }
682}
683
684fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
685    let value = Value::from(value);
686    let value = value
687        .try_to_scalar_value(&value.data_type())
688        .with_context(|_| DatatypesSnafu {
689            extra: format!("Failed to convert to scalar value: {}", value),
690        })?;
691    Ok(value)
692}
693
/// Execution state of a batching task.
#[derive(Debug, Clone)]
enum ExecState {
    /// No query is running; the state a task starts in and returns to after
    /// every query.
    Idle,
    /// NOTE(review): presumably set while a query is running — no code in
    /// this file constructs it; confirm against the task implementation.
    Executing,
}
699
/// How the next run of a task relates to its stored checkpoints.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckpointMode {
    /// The initial mode, and the mode a task falls back to when incremental
    /// execution is disabled or explicitly reset.
    FullSnapshot,
    /// Entered when checkpoints are advanced while incremental execution is
    /// not disabled.
    Incremental,
}
705
/// Information about a generated filter expression.
#[derive(Debug, Clone)]
pub struct FilterExprInfo {
    /// The `OR`-combined range predicates on `col_name`.
    pub expr: datafusion_expr::Expr,
    /// Name of the column the predicates filter on.
    pub col_name: String,
    /// The `(start, end)` ranges covered by `expr`; the predicate for each
    /// range is `col >= start AND col < end`.
    pub time_ranges: Vec<(Timestamp, Timestamp)>,
    /// The window size the ranges were generated with.
    pub window_size: chrono::Duration,
}
714
715impl FilterExprInfo {
716    pub fn total_window_length(&self) -> chrono::Duration {
717        self.time_ranges
718            .iter()
719            .fold(chrono::Duration::zero(), |acc, (start, end)| {
720                acc + end.sub(start).unwrap_or(chrono::Duration::zero())
721            })
722    }
723
724    pub fn predicate_for_col(
725        &self,
726        col_name: &str,
727    ) -> Result<Option<datafusion_expr::Expr>, Error> {
728        use datafusion_common::Column;
729        use datafusion_expr::{Expr, lit};
730
731        let mut expr_lst = Vec::with_capacity(self.time_ranges.len());
732        for (start, end) in &self.time_ranges {
733            let lower = to_df_literal(*start)?;
734            let upper = to_df_literal(*end)?;
735            let filter_col = || Expr::Column(Column::new_unqualified(col_name));
736            expr_lst.push(
737                filter_col()
738                    .gt_eq(lit(lower))
739                    .and(filter_col().lt(lit(upper))),
740            );
741        }
742
743        Ok(expr_lst.into_iter().reduce(|a, b| a.or(b)))
744    }
745}
746
747#[cfg(test)]
748mod test {
749    use pretty_assertions::assert_eq;
750    use session::context::QueryContext;
751
752    use super::*;
753    use crate::batching_mode::time_window::find_time_window_expr;
754    use crate::batching_mode::utils::sql_to_df_plan;
755    use crate::test_utils::create_test_query_engine;
756
757    #[test]
758    fn test_task_state_records_last_execution_time() {
759        let query_ctx = QueryContext::arc();
760        let (_tx, rx) = tokio::sync::oneshot::channel();
761        let mut state = TaskState::new(query_ctx, rx);
762
763        assert_eq!(None, state.last_execution_time_millis());
764        state.after_query_exec(std::time::Duration::from_millis(1), false);
765        assert_eq!(None, state.last_execution_time_millis());
766
767        state.after_query_exec(std::time::Duration::from_millis(1), true);
768        assert!(state.last_execution_time_millis().is_some());
769    }
770
771    #[test]
772    fn test_merge_dirty_time_windows() {
773        let merge_dist = DirtyTimeWindows::default().time_window_merge_threshold;
774        let testcases = vec![
775            // just enough to merge
776            (
777                vec![
778                    Timestamp::new_second(0),
779                    Timestamp::new_second((1 + merge_dist as i64) * 5 * 60),
780                ],
781                (chrono::Duration::seconds(5 * 60), None),
782                BTreeMap::from([(
783                    Timestamp::new_second(0),
784                    Some(Timestamp::new_second((2 + merge_dist as i64) * 5 * 60)),
785                )]),
786                Some(
787                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:25:00' AS TIMESTAMP)))",
788                ),
789            ),
790            // separate time window
791            (
792                vec![
793                    Timestamp::new_second(0),
794                    Timestamp::new_second((2 + merge_dist as i64) * 5 * 60),
795                ],
796                (chrono::Duration::seconds(5 * 60), None),
797                BTreeMap::from([
798                    (
799                        Timestamp::new_second(0),
800                        Some(Timestamp::new_second(5 * 60)),
801                    ),
802                    (
803                        Timestamp::new_second((2 + merge_dist as i64) * 5 * 60),
804                        Some(Timestamp::new_second((3 + merge_dist as i64) * 5 * 60)),
805                    ),
806                ]),
807                Some(
808                    "(((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:05:00' AS TIMESTAMP))) OR ((ts >= CAST('1970-01-01 00:25:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:30:00' AS TIMESTAMP))))",
809                ),
810            ),
811            // overlapping
812            (
813                vec![
814                    Timestamp::new_second(0),
815                    Timestamp::new_second((merge_dist as i64) * 5 * 60),
816                ],
817                (chrono::Duration::seconds(5 * 60), None),
818                BTreeMap::from([(
819                    Timestamp::new_second(0),
820                    Some(Timestamp::new_second((1 + merge_dist as i64) * 5 * 60)),
821                )]),
822                Some(
823                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:20:00' AS TIMESTAMP)))",
824                ),
825            ),
826            // complex overlapping
827            (
828                vec![
829                    Timestamp::new_second(0),
830                    Timestamp::new_second((merge_dist as i64) * 3),
831                    Timestamp::new_second((merge_dist as i64) * 3 * 2),
832                ],
833                (chrono::Duration::seconds(3), None),
834                BTreeMap::from([(
835                    Timestamp::new_second(0),
836                    Some(Timestamp::new_second((merge_dist as i64) * 7)),
837                )]),
838                Some(
839                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:00:21' AS TIMESTAMP)))",
840                ),
841            ),
842            // split range
843            (
844                Vec::from_iter((0..20).map(|i| Timestamp::new_second(i * 3)).chain(
845                    std::iter::once(Timestamp::new_second(
846                        60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1),
847                    )),
848                )),
849                (chrono::Duration::seconds(3), None),
850                BTreeMap::from([
851                    (Timestamp::new_second(0), Some(Timestamp::new_second(60))),
852                    (
853                        Timestamp::new_second(60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1)),
854                        Some(Timestamp::new_second(
855                            60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1) + 3,
856                        )),
857                    ),
858                ]),
859                Some(
860                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:00' AS TIMESTAMP)))",
861                ),
862            ),
863            // split 2 min into 1 min
864            (
865                Vec::from_iter((0..40).map(|i| Timestamp::new_second(i * 3))),
866                (chrono::Duration::seconds(3), None),
867                BTreeMap::from([(
868                    Timestamp::new_second(0),
869                    Some(Timestamp::new_second(40 * 3)),
870                )]),
871                Some(
872                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:00' AS TIMESTAMP)))",
873                ),
874            ),
875            // split 3s + 1min into 3s + 57s
876            (
877                Vec::from_iter(
878                    std::iter::once(Timestamp::new_second(0))
879                        .chain((0..40).map(|i| Timestamp::new_second(20 + i * 3))),
880                ),
881                (chrono::Duration::seconds(3), None),
882                BTreeMap::from([
883                    (Timestamp::new_second(0), Some(Timestamp::new_second(3))),
884                    (Timestamp::new_second(20), Some(Timestamp::new_second(140))),
885                ]),
886                Some(
887                    "(((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:00:03' AS TIMESTAMP))) OR ((ts >= CAST('1970-01-01 00:00:20' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:17' AS TIMESTAMP))))",
888                ),
889            ),
890            // expired
891            (
892                vec![
893                    Timestamp::new_second(0),
894                    Timestamp::new_second((merge_dist as i64) * 5 * 60),
895                ],
896                (
897                    chrono::Duration::seconds(5 * 60),
898                    Some(Timestamp::new_second((merge_dist as i64) * 6 * 60)),
899                ),
900                BTreeMap::from([]),
901                None,
902            ),
903        ];
904        // let len = testcases.len();
905        // let testcases = testcases[(len - 2)..(len - 1)].to_vec();
906        for (lower_bounds, (window_size, expire_lower_bound), expected, expected_filter_expr) in
907            testcases
908        {
909            let mut dirty = DirtyTimeWindows::default();
910            dirty.add_lower_bounds(lower_bounds.into_iter());
911            dirty
912                .merge_dirty_time_windows(window_size, expire_lower_bound)
913                .unwrap();
914            assert_eq!(expected, dirty.windows);
915            let filter_expr = dirty
916                .gen_filter_exprs(
917                    "ts",
918                    expire_lower_bound,
919                    window_size,
920                    dirty.max_filter_num_per_query,
921                    0,
922                    None,
923                )
924                .unwrap()
925                .map(|e| e.expr);
926
927            let unparser = datafusion::sql::unparser::Unparser::default();
928            let to_sql = filter_expr
929                .as_ref()
930                .map(|e| unparser.expr_to_sql(e).unwrap().to_string());
931            assert_eq!(expected_filter_expr, to_sql.as_deref());
932        }
933    }
934
935    #[tokio::test]
936    async fn test_align_time_window() {
937        type TimeWindow = (Timestamp, Option<Timestamp>);
938        struct TestCase {
939            sql: String,
940            aligns: Vec<(TimeWindow, TimeWindow)>,
941        }
942        let testcases: Vec<TestCase> = vec![TestCase{
943            sql: "SELECT date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window;".to_string(),
944            aligns: vec![
945                ((Timestamp::new_second(3), None), (Timestamp::new_second(0), None)),
946                ((Timestamp::new_second(8), None), (Timestamp::new_second(5), None)),
947                ((Timestamp::new_second(8), Some(Timestamp::new_second(10))), (Timestamp::new_second(5), Some(Timestamp::new_second(10)))),
948                ((Timestamp::new_second(8), Some(Timestamp::new_second(9))), (Timestamp::new_second(5), Some(Timestamp::new_second(10)))),
949            ],
950        }];
951
952        let query_engine = create_test_query_engine();
953        let ctx = QueryContext::arc();
954        for TestCase { sql, aligns } in testcases {
955            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &sql, true)
956                .await
957                .unwrap();
958
959            let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
960                &plan,
961                query_engine.engine_state().catalog_manager().clone(),
962                ctx.clone(),
963            )
964            .await
965            .unwrap();
966
967            let time_window_expr = time_window_expr
968                .map(|expr| {
969                    TimeWindowExpr::from_expr(
970                        &expr,
971                        &column_name,
972                        &df_schema,
973                        &query_engine.engine_state().session_state(),
974                    )
975                })
976                .transpose()
977                .unwrap()
978                .unwrap();
979
980            let dirty = DirtyTimeWindows::default();
981            for (before_align, expected_after_align) in aligns {
982                let after_align = dirty
983                    .align_time_window(before_align.0, before_align.1, &time_window_expr)
984                    .unwrap();
985                assert_eq!(expected_after_align, after_align);
986            }
987        }
988    }
989
990    #[test]
991    fn test_task_state_checkpoint_mode_and_advancement() {
992        let query_ctx = QueryContext::arc();
993        let (_tx, rx) = tokio::sync::oneshot::channel();
994        let mut state = TaskState::new(query_ctx, rx);
995
996        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
997        assert!(state.checkpoints().is_empty());
998
999        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1000        assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
1001        assert_eq!(
1002            state.checkpoints(),
1003            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1004        );
1005
1006        state.mark_full_snapshot();
1007        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1008        assert_eq!(
1009            state.checkpoints(),
1010            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1011        );
1012    }
1013
1014    #[test]
1015    fn test_disable_incremental_persists_full_snapshot_mode() {
1016        let query_ctx = QueryContext::arc();
1017        let (_tx, rx) = tokio::sync::oneshot::channel();
1018        let mut state = TaskState::new(query_ctx, rx);
1019
1020        assert!(!state.is_incremental_disabled());
1021
1022        // After disable, mode becomes FullSnapshot and flag is set.
1023        state.disable_incremental();
1024        assert!(state.is_incremental_disabled());
1025        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1026
1027        // `advance_checkpoints` will NOT transition to Incremental when disabled.
1028        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1029        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1030        assert_eq!(
1031            state.checkpoints(),
1032            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1033        );
1034
1035        // `mark_full_snapshot` does not re-enable incremental.
1036        state.mark_full_snapshot();
1037        assert!(state.is_incremental_disabled());
1038        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1039    }
1040
1041    #[test]
1042    fn test_full_snapshot_checkpoint_advancement_requires_participating_regions() {
1043        let query_ctx = QueryContext::arc();
1044        let (_tx, rx) = tokio::sync::oneshot::channel();
1045        let state = TaskState::new(query_ctx, rx);
1046
1047        assert!(!state.can_advance_full_snapshot_checkpoints(&BTreeSet::new(), &HashMap::new()));
1048        assert!(!state.can_advance_full_snapshot_checkpoints(
1049            &BTreeSet::from([1_u64, 2_u64]),
1050            &HashMap::from([(1_u64, 10_u64)]),
1051        ));
1052        assert!(state.can_advance_full_snapshot_checkpoints(
1053            &BTreeSet::from([1_u64, 2_u64]),
1054            &HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]),
1055        ));
1056    }
1057
1058    #[test]
1059    fn test_incremental_checkpoint_advancement_requires_participation_alignment() {
1060        let query_ctx = QueryContext::arc();
1061        let (_tx, rx) = tokio::sync::oneshot::channel();
1062        let mut state = TaskState::new(query_ctx, rx);
1063        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1064
1065        assert!(
1066            state.can_advance_incremental_checkpoints_with_participation(
1067                &BTreeSet::from([1_u64]),
1068                &HashMap::from([(1_u64, 11_u64)]),
1069            )
1070        );
1071        assert!(
1072            !state.can_advance_incremental_checkpoints_with_participation(
1073                &BTreeSet::from([1_u64, 2_u64]),
1074                &HashMap::from([(1_u64, 11_u64)]),
1075            )
1076        );
1077        assert!(
1078            !state.can_advance_incremental_checkpoints_with_participation(
1079                &BTreeSet::from([3_u64]),
1080                &HashMap::from([(3_u64, 11_u64)]),
1081            )
1082        );
1083        assert!(
1084            !state.can_advance_incremental_checkpoints_with_participation(
1085                &BTreeSet::from([1_u64]),
1086                &HashMap::from([(1_u64, 9_u64)]),
1087            )
1088        );
1089        assert!(
1090            state.can_advance_incremental_checkpoints_with_participation(
1091                &BTreeSet::from([1_u64, 2_u64]),
1092                &HashMap::from([(1_u64, 11_u64), (2_u64, 21_u64)]),
1093            )
1094        );
1095
1096        state.disable_incremental();
1097        assert!(
1098            !state.can_advance_incremental_checkpoints_with_participation(
1099                &BTreeSet::from([1_u64, 2_u64]),
1100                &HashMap::from([(1_u64, 12_u64), (2_u64, 22_u64)]),
1101            )
1102        );
1103    }
1104
1105    #[test]
1106    fn test_incremental_checkpoint_advancement_merges_participating_subset() {
1107        let query_ctx = QueryContext::arc();
1108        let (_tx, rx) = tokio::sync::oneshot::channel();
1109        let mut state = TaskState::new(query_ctx, rx);
1110        state.advance_checkpoints(HashMap::from([
1111            (1_u64, 10_u64),
1112            (2_u64, 20_u64),
1113            (3_u64, 30_u64),
1114        ]));
1115
1116        state.advance_incremental_checkpoints_with_participation(
1117            &BTreeSet::from([1_u64, 3_u64]),
1118            HashMap::from([(1_u64, 12_u64), (3_u64, 35_u64)]),
1119        );
1120
1121        assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
1122        assert_eq!(
1123            state.checkpoints(),
1124            &BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)])
1125        );
1126    }
1127
1128    #[test]
1129    fn test_filter_expr_info_predicate_for_col_empty_ranges() {
1130        let filter = FilterExprInfo {
1131            expr: datafusion_expr::col("ts"),
1132            col_name: "ts".to_string(),
1133            time_ranges: vec![],
1134            window_size: chrono::Duration::seconds(1),
1135        };
1136
1137        assert!(filter.predicate_for_col("time_window").unwrap().is_none());
1138    }
1139
1140    #[test]
1141    fn test_filter_expr_info_predicate_for_col_single_range() {
1142        let filter = FilterExprInfo {
1143            expr: datafusion_expr::col("ts"),
1144            col_name: "ts".to_string(),
1145            time_ranges: vec![(Timestamp::new_second(0), Timestamp::new_second(1))],
1146            window_size: chrono::Duration::seconds(1),
1147        };
1148
1149        let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
1150        let unparser = datafusion::sql::unparser::Unparser::default();
1151        assert_eq!(
1152            "((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP)))",
1153            unparser.expr_to_sql(&predicate).unwrap().to_string()
1154        );
1155    }
1156
1157    #[test]
1158    fn test_filter_expr_info_predicate_for_col_multiple_ranges() {
1159        let filter = FilterExprInfo {
1160            expr: datafusion_expr::col("ts"),
1161            col_name: "ts".to_string(),
1162            time_ranges: vec![
1163                (Timestamp::new_second(0), Timestamp::new_second(1)),
1164                (Timestamp::new_second(10), Timestamp::new_second(11)),
1165            ],
1166            window_size: chrono::Duration::seconds(1),
1167        };
1168
1169        let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
1170        let unparser = datafusion::sql::unparser::Unparser::default();
1171        assert_eq!(
1172            "(((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP))) OR ((time_window >= CAST('1970-01-01 00:00:10' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:11' AS TIMESTAMP))))",
1173            unparser.expr_to_sql(&predicate).unwrap().to_string()
1174        );
1175    }
1176
1177    /// Helper: create a `TaskState` whose `last_update_time` is a known duration in the past.
1178    fn state_with_past_update(age: Duration) -> TaskState {
1179        let query_ctx = QueryContext::arc();
1180        let (_tx, rx) = tokio::sync::oneshot::channel();
1181        let mut state = TaskState::new(query_ctx, rx);
1182        state.last_update_time = Instant::now() - age;
1183        state
1184    }
1185
1186    #[test]
1187    fn test_short_incremental_cadence_uses_min_refresh() {
1188        // When prefer_short_incremental_cadence is true and dirty backlog is manageable,
1189        // the next start time should be last_update_time + min_refresh (short cadence),
1190        // ignoring the longer time_window_size.
1191        let state = state_with_past_update(Duration::from_secs(10));
1192
1193        let time_window_size = Some(Duration::from_secs(60)); // large window
1194        let min_refresh = Duration::from_secs(5);
1195        let flow_id = 1;
1196
1197        let result = state.get_next_start_query_time(
1198            flow_id,
1199            &time_window_size,
1200            min_refresh,
1201            None,
1202            20,
1203            true, // prefer_short_incremental_cadence
1204        );
1205
1206        // With short cadence, result should be last_update_time + min_refresh.
1207        let expected = state.last_update_time + min_refresh;
1208        assert_eq!(result, expected);
1209    }
1210
1211    #[test]
1212    fn test_short_incremental_cadence_respects_last_query_duration() {
1213        let mut state = state_with_past_update(Duration::from_secs(10));
1214        state.last_query_duration = Duration::from_secs(20);
1215
1216        let time_window_size = Some(Duration::from_secs(60));
1217        let min_refresh = Duration::from_secs(5);
1218        let flow_id = 1;
1219
1220        let result = state.get_next_start_query_time(
1221            flow_id,
1222            &time_window_size,
1223            min_refresh,
1224            None,
1225            20,
1226            true,
1227        );
1228
1229        assert_eq!(result, state.last_update_time + state.last_query_duration);
1230    }
1231
1232    #[test]
1233    fn test_short_incremental_cadence_respects_max_timeout() {
1234        let mut state = state_with_past_update(Duration::from_secs(10));
1235        state.last_query_duration = Duration::from_secs(20);
1236
1237        let time_window_size = Some(Duration::from_secs(60));
1238        let min_refresh = Duration::from_secs(30);
1239        let max_timeout = Duration::from_secs(5);
1240        let flow_id = 1;
1241
1242        let result = state.get_next_start_query_time(
1243            flow_id,
1244            &time_window_size,
1245            min_refresh,
1246            Some(max_timeout),
1247            20,
1248            true,
1249        );
1250
1251        assert_eq!(result, state.last_update_time + max_timeout);
1252    }
1253
1254    #[test]
1255    fn test_full_snapshot_ignores_short_cadence() {
1256        // When prefer_short_incremental_cadence is false (full snapshot mode),
1257        // the normal long-cadence based on time_window_size applies.
1258        let mut state = state_with_past_update(Duration::from_secs(10));
1259        // Make last_query_duration small so the lower bound (time_window_size) dominates.
1260        state.last_query_duration = Duration::from_secs(1);
1261
1262        let time_window_size = Some(Duration::from_secs(60)); // large window
1263        let min_refresh = Duration::from_secs(5);
1264        let flow_id = 1;
1265
1266        let result = state.get_next_start_query_time(
1267            flow_id,
1268            &time_window_size,
1269            min_refresh,
1270            None,
1271            20,
1272            false, // prefer_short_incremental_cadence = false
1273        );
1274
1275        // With normal cadence, result should be last_update_time + time_window_size
1276        // (since last_query_duration < time_window_size).
1277        let expected = state.last_update_time + Duration::from_secs(60);
1278        assert_eq!(result, expected);
1279    }
1280
1281    #[test]
1282    fn test_dirty_window_overflow_schedules_immediately_even_with_short_cadence() {
1283        // Dirty-window overflow must always schedule immediately,
1284        // regardless of prefer_short_incremental_cadence.
1285        let mut state = state_with_past_update(Duration::from_secs(10));
1286        // Create a very large dirty backlog.
1287        state
1288            .dirty_time_windows
1289            .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(3600)));
1290
1291        let time_window_size = Some(Duration::from_secs(1)); // tiny window => overflow
1292        let min_refresh = Duration::from_secs(5);
1293        let flow_id = 1;
1294
1295        // With short cadence flag.
1296        let result = state.get_next_start_query_time(
1297            flow_id,
1298            &time_window_size,
1299            min_refresh,
1300            None,
1301            1, // max 1 filter => tiny capacity
1302            true,
1303        );
1304        assert!(
1305            result <= Instant::now(),
1306            "dirty overflow should schedule immediately"
1307        );
1308
1309        // Without short cadence flag — same behavior.
1310        let result2 = state.get_next_start_query_time(
1311            flow_id,
1312            &time_window_size,
1313            min_refresh,
1314            None,
1315            1,
1316            false,
1317        );
1318        assert!(
1319            result2 <= Instant::now(),
1320            "dirty overflow should schedule immediately"
1321        );
1322    }
1323
1324    #[test]
1325    fn test_incremental_disabled_ignores_short_cadence() {
1326        // When prefer_short_incremental_cadence is true but the dirty backlog is
1327        // manageable, the short cadence is applied. This test verifies that the
1328        // caller-side guard (checkpoint_mode + !is_incremental_disabled) controls
1329        // whether short cadence is requested at all — when incremental is disabled,
1330        // the flag is false, and the long cadence applies.
1331        //
1332        // This simulates the case where the caller computed
1333        // prefer_short_incremental_cadence = false (e.g. incremental disabled
1334        // or FullSnapshot mode), so the long cadence is used.
1335        let mut state = state_with_past_update(Duration::from_secs(10));
1336        state.last_query_duration = Duration::from_secs(1);
1337
1338        let time_window_size = Some(Duration::from_secs(60));
1339        let min_refresh = Duration::from_secs(5);
1340        let flow_id = 1;
1341
1342        let result = state.get_next_start_query_time(
1343            flow_id,
1344            &time_window_size,
1345            min_refresh,
1346            None,
1347            20,
1348            false, // prefer_short_incremental_cadence = false
1349        );
1350
1351        // With normal cadence, result should be last_update_time + time_window_size.
1352        let expected = state.last_update_time + Duration::from_secs(60);
1353        assert_eq!(result, expected);
1354    }
1355}