Skip to main content

flow/batching_mode/
state.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
//! State of a batching-mode task: the part of the task that changes frequently
//! while it runs (query timing, dirty time windows and checkpoints).
17
18use std::collections::{BTreeMap, BTreeSet, HashMap};
19use std::time::Duration;
20
21use common_telemetry::debug;
22use common_telemetry::tracing::warn;
23use common_time::Timestamp;
24use datatypes::value::Value;
25use session::context::QueryContextRef;
26use snafu::{OptionExt, ResultExt, ensure};
27use tokio::sync::oneshot;
28use tokio::time::Instant;
29
30use crate::batching_mode::task::BatchingTask;
31use crate::batching_mode::time_window::TimeWindowExpr;
32use crate::error::{DatatypesSnafu, InternalSnafu, TimeSnafu, UnexpectedSnafu};
33use crate::metrics::{
34    METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_SIZE,
35    METRIC_FLOW_BATCHING_ENGINE_STALLED_WINDOW_SIZE,
36};
37use crate::{Error, FlowId};
38
/// The state of the [`BatchingTask`].
///
/// Holds the frequently changing bookkeeping of a task: timing of the last
/// query, the dirty time windows that still have to be refreshed, and the
/// checkpoint progress used to choose between full-snapshot and incremental
/// execution.
#[derive(Debug)]
pub struct TaskState {
    /// Query context
    pub(crate) query_ctx: QueryContextRef,
    /// Completion time of the last query (initialized to the creation time)
    last_update_time: Instant,
    /// Duration of the last query
    last_query_duration: Duration,
    /// Last successful execution time in unix timestamp milliseconds.
    last_exec_time_millis: Option<i64>,
    /// Dirty time windows that need to be updated,
    /// a non-overlapping mapping of `start -> end`
    pub(crate) dirty_time_windows: DirtyTimeWindows,
    /// Current checkpoint mode; starts as `CheckpointMode::FullSnapshot`.
    checkpoint_mode: CheckpointMode,
    /// Region id -> last consumed watermark sequence. Incremental scans use
    /// this as the next lower sequence bound for each source region.
    checkpoints: BTreeMap<u64, u64>,
    /// Once set, the task will never attempt incremental mode again.
    /// Set when the flow's query shape is deterministically incompatible
    /// with incremental execution (e.g. unsupported aggregate expressions).
    incremental_disabled: bool,
    /// Execution state; reset to `ExecState::Idle` by `after_query_exec`.
    exec_state: ExecState,
    /// Shutdown receiver
    pub(crate) shutdown_rx: oneshot::Receiver<()>,
    /// Task handle
    pub(crate) task_handle: Option<tokio::task::JoinHandle<()>>,
}
67impl TaskState {
    /// Create a task state with the default (empty) [`DirtyTimeWindows`].
    pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
        Self::with_dirty_time_windows(query_ctx, shutdown_rx, DirtyTimeWindows::default())
    }
71
72    pub fn with_dirty_time_windows(
73        query_ctx: QueryContextRef,
74        shutdown_rx: oneshot::Receiver<()>,
75        dirty_time_windows: DirtyTimeWindows,
76    ) -> Self {
77        Self {
78            query_ctx,
79            last_update_time: Instant::now(),
80            last_query_duration: Duration::from_secs(0),
81            last_exec_time_millis: None,
82            dirty_time_windows,
83            checkpoint_mode: CheckpointMode::FullSnapshot,
84            checkpoints: Default::default(),
85            incremental_disabled: false,
86            exec_state: ExecState::Idle,
87            shutdown_rx,
88            task_handle: None,
89        }
90    }
91
92    /// called after last query is done
93    /// `is_succ` indicate whether the last query is successful
94    pub fn after_query_exec(&mut self, elapsed: Duration, is_succ: bool) {
95        self.exec_state = ExecState::Idle;
96        self.last_query_duration = elapsed;
97        self.last_update_time = Instant::now();
98        if is_succ {
99            self.last_exec_time_millis = Some(common_time::util::current_time_millis());
100        }
101    }
102
    /// Last successful execution time in unix timestamp milliseconds,
    /// or `None` if no query has succeeded yet.
    pub fn last_execution_time_millis(&self) -> Option<i64> {
        self.last_exec_time_millis
    }
106
    /// Current checkpoint mode of the task.
    pub fn checkpoint_mode(&self) -> CheckpointMode {
        self.checkpoint_mode
    }
110
    /// Region id -> last consumed watermark sequence.
    pub fn checkpoints(&self) -> &BTreeMap<u64, u64> {
        &self.checkpoints
    }
114
    /// Whether incremental mode has been permanently disabled for this task.
    pub fn is_incremental_disabled(&self) -> bool {
        self.incremental_disabled
    }
118
119    /// Permanently disable incremental mode for this task and
120    /// immediately fall back to full snapshot for the current cycle.
121    pub fn disable_incremental(&mut self) {
122        self.incremental_disabled = true;
123        self.mark_full_snapshot();
124    }
125
    /// Switch the checkpoint mode to full snapshot. Stored checkpoints are kept.
    pub fn mark_full_snapshot(&mut self) {
        self.checkpoint_mode = CheckpointMode::FullSnapshot;
    }
129
130    pub fn advance_checkpoints(&mut self, watermark_map: HashMap<u64, u64>) {
131        self.checkpoints = watermark_map.into_iter().collect();
132        if !self.incremental_disabled {
133            self.checkpoint_mode = CheckpointMode::Incremental;
134        }
135    }
136
137    pub fn advance_incremental_checkpoints_with_participation(
138        &mut self,
139        participating_regions: &BTreeSet<u64>,
140        watermark_map: HashMap<u64, u64>,
141    ) {
142        for region_id in participating_regions {
143            if let Some(seq) = watermark_map.get(region_id) {
144                self.checkpoints.insert(*region_id, *seq);
145            }
146        }
147        if !self.incremental_disabled {
148            self.checkpoint_mode = CheckpointMode::Incremental;
149        }
150    }
151
152    pub fn can_advance_full_snapshot_checkpoints(
153        &self,
154        participating_regions: &BTreeSet<u64>,
155        watermark_map: &HashMap<u64, u64>,
156    ) -> bool {
157        !participating_regions.is_empty()
158            && participating_regions.len() == watermark_map.len()
159            && participating_regions
160                .iter()
161                .all(|region_id| watermark_map.contains_key(region_id))
162    }
163
164    pub fn can_advance_incremental_checkpoints_with_participation(
165        &self,
166        participating_regions: &BTreeSet<u64>,
167        watermark_map: &HashMap<u64, u64>,
168    ) -> bool {
169        !self.incremental_disabled
170            && !self.checkpoints.is_empty()
171            && !participating_regions.is_empty()
172            && participating_regions.len() == watermark_map.len()
173            && participating_regions
174                .iter()
175                .all(|region_id| self.checkpoints.contains_key(region_id))
176            && participating_regions.iter().all(|region_id| {
177                let checkpoint = self.checkpoints.get(region_id);
178                watermark_map
179                    .get(region_id)
180                    .zip(checkpoint)
181                    .is_some_and(|(seq, checkpoint)| seq >= checkpoint)
182            })
183    }
184
185    /// Compute the next query delay based on the time window size or the last query duration.
186    /// Aiming to avoid too frequent queries. But also not too long delay.
187    ///
188    /// next wait time is calculated as:
189    /// last query duration, capped by [max(min_run_interval, time_window_size), max_timeout],
190    /// note at most wait for `max_timeout`.
191    ///
192    /// if current the dirty time range is longer than one query can handle,
193    /// execute immediately to faster clean up dirty time windows.
194    ///
195    /// If `prefer_short_incremental_cadence` is true, run incremental queries
196    /// more often when there is no large dirty backlog. This only reduces the
197    /// chance of hitting a stale cursor after flush; it is not required for
198    /// correctness.
199    pub fn get_next_start_query_time(
200        &self,
201        flow_id: FlowId,
202        time_window_size: &Option<Duration>,
203        min_refresh_duration: Duration,
204        max_timeout: Option<Duration>,
205        max_filter_num_per_query: usize,
206        prefer_short_incremental_cadence: bool,
207    ) -> Instant {
208        // = last query duration, capped by [max(min_run_interval, time_window_size), max_timeout], note at most `max_timeout`
209        let lower = time_window_size.unwrap_or(min_refresh_duration);
210        let next_duration = self.last_query_duration.max(lower);
211        let next_duration = if let Some(max_timeout) = max_timeout {
212            next_duration.min(max_timeout)
213        } else {
214            next_duration
215        };
216
217        let cur_dirty_window_size = self.dirty_time_windows.window_size();
218        // compute how much time range can be handled in one query
219        let max_query_update_range = (*time_window_size)
220            .unwrap_or_default()
221            .mul_f64(max_filter_num_per_query as f64);
222        // if dirty time range is more than one query can handle, execute immediately
223        // to faster clean up dirty time windows
224        if cur_dirty_window_size < max_query_update_range {
225            if prefer_short_incremental_cadence {
226                // Run incremental queries sooner than the normal time-window
227                // cadence, while still backing off by at least the previous
228                // query duration and respecting the max-timeout cap.
229                let next_duration = self.last_query_duration.max(min_refresh_duration);
230                let next_duration = if let Some(max_timeout) = max_timeout {
231                    next_duration.min(max_timeout)
232                } else {
233                    next_duration
234                };
235                self.last_update_time + next_duration
236            } else {
237                self.last_update_time + next_duration
238            }
239        } else {
240            // if dirty time windows can't be clean up in one query, execute immediately to faster
241            // clean up dirty time windows
242            debug!(
243                "Flow id = {}, still have too many {} dirty time window({:?}), execute immediately",
244                flow_id,
245                self.dirty_time_windows.windows.len(),
246                self.dirty_time_windows.windows
247            );
248            Instant::now()
249        }
250    }
251}
252
/// Keeps track of dirty time windows, i.e. time windows that had new data inserted
/// since the last query.
#[derive(Debug, Clone)]
pub struct DirtyTimeWindows {
    /// Windows as a non-overlapping mapping of `start -> end`.
    /// `end` is exclusive and optional: `None` marks a dirty window whose
    /// upper bound is not known yet.
    windows: BTreeMap<Timestamp, Option<Timestamp>>,
    /// Maximum number of filters allowed in a single query
    max_filter_num_per_query: usize,
    /// Time window merge distance: `merge_dirty_time_windows` merges two
    /// neighbouring windows when the gap between them is at most
    /// `window_size * time_window_merge_threshold`.
    time_window_merge_threshold: usize,
}
266
impl DirtyTimeWindows {
    /// Create an empty set of dirty time windows with the given limits.
    ///
    /// * `max_filter_num_per_query` - maximum number of filters allowed in a single query
    /// * `time_window_merge_threshold` - merge distance, in multiples of the window size
    pub fn new(max_filter_num_per_query: usize, time_window_merge_threshold: usize) -> Self {
        Self {
            windows: BTreeMap::new(),
            max_filter_num_per_query,
            time_window_merge_threshold,
        }
    }

    /// Test-only accessor for the configured filter limit.
    #[cfg(test)]
    pub(crate) fn max_filter_num_per_query(&self) -> usize {
        self.max_filter_num_per_query
    }

    /// Test-only accessor for the configured merge threshold.
    #[cfg(test)]
    pub(crate) fn time_window_merge_threshold(&self) -> usize {
        self.time_window_merge_threshold
    }
}
286
287impl Default for DirtyTimeWindows {
288    fn default() -> Self {
289        Self {
290            windows: BTreeMap::new(),
291            max_filter_num_per_query: 20,
292            time_window_merge_threshold: 3,
293        }
294    }
295}
296
297impl DirtyTimeWindows {
    /// Time window merge distance, in multiples of the window size.
    ///
    /// NOTE(review): not referenced in the visible part of this file; merging
    /// uses the per-instance `time_window_merge_threshold` instead — confirm
    /// whether this constant is still needed.
    ///
    /// TODO(discord9): make those configurable
    pub const MERGE_DIST: i32 = 3;
302
303    /// Add lower bounds to the dirty time windows. Upper bounds are ignored.
304    ///
305    /// # Arguments
306    ///
307    /// * `lower_bounds` - An iterator of lower bounds to be added.
308    pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
309        for lower_bound in lower_bounds {
310            let entry = self.windows.entry(lower_bound);
311            entry.or_insert(None);
312        }
313    }
314
315    pub fn window_size(&self) -> Duration {
316        let mut ret = Duration::from_secs(0);
317        for (start, end) in &self.windows {
318            if let Some(end) = end
319                && let Some(duration) = end.sub(start)
320            {
321                ret += duration.to_std().unwrap_or_default();
322            }
323        }
324        ret
325    }
326
    /// Add a single dirty window, merging it with an existing window that has
    /// the same start. `end` is exclusive; `None` means the end is unknown.
    pub fn add_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
        self.add_or_merge_window(start, end);
    }
330
331    pub fn add_windows(&mut self, time_ranges: Vec<(Timestamp, Timestamp)>) {
332        for (start, end) in time_ranges {
333            self.add_or_merge_window(start, Some(end));
334        }
335    }
336
337    /// Add all dirty markers from another dirty-window set.
338    pub fn add_dirty_windows(&mut self, dirty_windows: &DirtyTimeWindows) {
339        for (start, end) in &dirty_windows.windows {
340            self.add_or_merge_window(*start, *end);
341        }
342    }
343
344    fn add_or_merge_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
345        self.windows
346            .entry(start)
347            .and_modify(|current_end| {
348                *current_end = Self::union_window_end(*current_end, end);
349            })
350            .or_insert(end);
351    }
352
353    fn union_window_end(
354        current_end: Option<Timestamp>,
355        incoming_end: Option<Timestamp>,
356    ) -> Option<Timestamp> {
357        match (current_end, incoming_end) {
358            (Some(current), Some(incoming)) => Some(current.max(incoming)),
359            // `None` is a dirty marker without a known upper bound.  When one
360            // side has a concrete end, keep it so merging a restored snapshot
361            // never shrinks an already-known dirty range with the same start.
362            (Some(end), None) | (None, Some(end)) => Some(end),
363            (None, None) => None,
364        }
365    }
366
    /// Remove all dirty time windows; useful when no time window expr can be found.
    pub fn clean(&mut self) {
        self.windows.clear();
    }
371
    /// Mark the whole data set as dirty. Only useful for full aggregation
    /// without a time window, to record that some new data was inserted.
    ///
    /// This is stored as a window starting at second 0 with no known end.
    pub fn set_dirty(&mut self) {
        self.add_or_merge_window(Timestamp::new_second(0), None);
    }
377
    /// Number of dirty windows currently stored (windows are counted as
    /// stored, regardless of their length).
    pub fn len(&self) -> usize {
        self.windows.len()
    }
382
    /// Whether there is no dirty window at all.
    pub fn is_empty(&self) -> bool {
        self.windows.is_empty()
    }
386
387    /// Get the effective count of time windows, which is the number of time windows that can be
388    /// used for query, compute from total time window range divided by `window_size`.
389    pub fn effective_count(&self, window_size: &Duration) -> usize {
390        if self.windows.is_empty() {
391            return 0;
392        }
393        let window_size =
394            chrono::Duration::from_std(*window_size).unwrap_or(chrono::Duration::zero());
395        let total_window_time_range =
396            self.windows
397                .iter()
398                .fold(chrono::Duration::zero(), |acc, (start, end)| {
399                    if let Some(end) = end {
400                        acc + end.sub(start).unwrap_or(chrono::Duration::zero())
401                    } else {
402                        acc + window_size
403                    }
404                });
405
406        // not sure window_size is zero have any meaning, but just in case
407        if window_size.num_seconds() == 0 {
408            0
409        } else {
410            (total_window_time_range.num_seconds() / window_size.num_seconds()) as usize
411        }
412    }
413
414    /// Generate all filter expressions consuming all time windows
415    ///
416    /// there is two limits:
417    /// - shouldn't return a too long time range(<=`window_size * window_cnt`), so that the query can be executed in a reasonable time
418    /// - shouldn't return too many time range exprs, so that the query can be parsed properly instead of causing parser to overflow
419    pub fn gen_filter_exprs(
420        &mut self,
421        col_name: &str,
422        expire_lower_bound: Option<Timestamp>,
423        window_size: chrono::Duration,
424        window_cnt: usize,
425        flow_id: FlowId,
426        task_ctx: Option<&BatchingTask>,
427    ) -> Result<Option<FilterExprInfo>, Error> {
428        ensure!(
429            window_size.num_seconds() > 0,
430            UnexpectedSnafu {
431                reason: "window_size is zero, can't generate filter exprs",
432            }
433        );
434
435        debug!(
436            "expire_lower_bound: {:?}, window_size: {:?}",
437            expire_lower_bound.map(|t| t.to_iso8601_string()),
438            window_size
439        );
440        self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
441
442        if self.windows.len() > window_cnt {
443            let first_time_window = self.windows.first_key_value();
444            let last_time_window = self.windows.last_key_value();
445
446            if let Some(task_ctx) = task_ctx {
447                warn!(
448                    "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
449                    task_ctx.config.flow_id,
450                    self.windows.len(),
451                    window_cnt,
452                    task_ctx.config.time_window_expr,
453                    task_ctx.config.expire_after,
454                    first_time_window,
455                    last_time_window,
456                    task_ctx.config.query
457                );
458            } else {
459                warn!(
460                    "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. first_time_window={:?}, last_time_window={:?}",
461                    flow_id,
462                    self.windows.len(),
463                    window_cnt,
464                    first_time_window,
465                    last_time_window
466                )
467            }
468        }
469
470        // get the first `window_cnt` time windows
471        let max_time_range = window_size * window_cnt as i32;
472
473        let mut to_be_query = BTreeMap::new();
474        let mut new_windows = self.windows.clone();
475        let mut cur_time_range = chrono::Duration::zero();
476        for (idx, (start, end)) in self.windows.iter().enumerate() {
477            let first_end = start
478                .add_duration(window_size.to_std().unwrap())
479                .context(TimeSnafu)?;
480            let end = end.unwrap_or(first_end);
481
482            // if time range is too long, stop
483            if cur_time_range >= max_time_range {
484                break;
485            }
486
487            // if we have enough time windows, stop
488            if idx >= window_cnt {
489                break;
490            }
491
492            let Some(x) = end.sub(start) else {
493                continue;
494            };
495            if cur_time_range + x <= max_time_range {
496                to_be_query.insert(*start, Some(end));
497                new_windows.remove(start);
498                cur_time_range += x;
499            } else {
500                // too large a window, split it
501                // split at window_size * times
502                let surplus = max_time_range - cur_time_range;
503                if surplus.num_seconds() <= window_size.num_seconds() {
504                    // Skip splitting if surplus is smaller than window_size
505                    break;
506                }
507                let times = surplus.num_seconds() / window_size.num_seconds();
508
509                let split_offset = window_size * times as i32;
510                let split_at = start
511                    .add_duration(split_offset.to_std().unwrap())
512                    .context(TimeSnafu)?;
513                to_be_query.insert(*start, Some(split_at));
514
515                // remove the original window
516                new_windows.remove(start);
517                new_windows.insert(split_at, Some(end));
518                cur_time_range += split_offset;
519                break;
520            }
521        }
522
523        self.windows = new_windows;
524
525        METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_CNT
526            .with_label_values(&[flow_id.to_string().as_str()])
527            .observe(to_be_query.len() as f64);
528
529        let full_time_range = to_be_query
530            .iter()
531            .fold(chrono::Duration::zero(), |acc, (start, end)| {
532                if let Some(end) = end {
533                    acc + end.sub(start).unwrap_or(chrono::Duration::zero())
534                } else {
535                    acc + window_size
536                }
537            })
538            .num_seconds() as f64;
539        METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_SIZE
540            .with_label_values(&[flow_id.to_string().as_str()])
541            .observe(full_time_range);
542
543        let stalled_time_range =
544            self.windows
545                .iter()
546                .fold(chrono::Duration::zero(), |acc, (start, end)| {
547                    if let Some(end) = end {
548                        acc + end.sub(start).unwrap_or(chrono::Duration::zero())
549                    } else {
550                        acc + window_size
551                    }
552                });
553
554        METRIC_FLOW_BATCHING_ENGINE_STALLED_WINDOW_SIZE
555            .with_label_values(&[flow_id.to_string().as_str()])
556            .observe(stalled_time_range.num_seconds() as f64);
557
558        let std_window_size = window_size.to_std().map_err(|e| {
559            InternalSnafu {
560                reason: e.to_string(),
561            }
562            .build()
563        })?;
564
565        let mut expr_lst = vec![];
566        let mut time_ranges = vec![];
567        for (start, end) in to_be_query.into_iter() {
568            // align using time window exprs
569            let (start, end) = if let Some(ctx) = task_ctx {
570                let Some(time_window_expr) = &ctx.config.time_window_expr else {
571                    UnexpectedSnafu {
572                        reason: "time_window_expr is not set",
573                    }
574                    .fail()?
575                };
576                self.align_time_window(start, end, time_window_expr)?
577            } else {
578                (start, end)
579            };
580            let end = end.unwrap_or(start.add_duration(std_window_size).context(TimeSnafu)?);
581            time_ranges.push((start, end));
582
583            debug!(
584                "Time window start: {:?}, end: {:?}",
585                start.to_iso8601_string(),
586                end.to_iso8601_string()
587            );
588
589            use datafusion_expr::{col, lit};
590            let lower = to_df_literal(start)?;
591            let upper = to_df_literal(end)?;
592            let expr = col(col_name)
593                .gt_eq(lit(lower))
594                .and(col(col_name).lt(lit(upper)));
595            expr_lst.push(expr);
596        }
597        let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
598        let ret = expr.map(|expr| FilterExprInfo {
599            expr,
600            col_name: col_name.to_string(),
601            time_ranges,
602            window_size,
603        });
604        Ok(ret)
605    }
606
607    fn align_time_window(
608        &self,
609        start: Timestamp,
610        end: Option<Timestamp>,
611        time_window_expr: &TimeWindowExpr,
612    ) -> Result<(Timestamp, Option<Timestamp>), Error> {
613        let align_start = time_window_expr.eval(start)?.0.context(UnexpectedSnafu {
614            reason: format!(
615                "Failed to align start time {:?} with time window expr {:?}",
616                start, time_window_expr
617            ),
618        })?;
619        let align_end = end
620            .and_then(|end| {
621                time_window_expr
622                    .eval(end)
623                    // if after aligned, end is the same, then use end(because it's already aligned) else use aligned end
624                    .map(|r| if r.0 == Some(end) { r.0 } else { r.1 })
625                    .transpose()
626            })
627            .transpose()?;
628        Ok((align_start, align_end))
629    }
630
631    /// Merge time windows that overlaps or get too close
632    ///
633    /// TODO(discord9): not merge and prefer to send smaller time windows? how?
634    pub fn merge_dirty_time_windows(
635        &mut self,
636        window_size: chrono::Duration,
637        expire_lower_bound: Option<Timestamp>,
638    ) -> Result<(), Error> {
639        if self.windows.is_empty() {
640            return Ok(());
641        }
642
643        let mut new_windows = BTreeMap::new();
644
645        let std_window_size = window_size.to_std().map_err(|e| {
646            InternalSnafu {
647                reason: e.to_string(),
648            }
649            .build()
650        })?;
651
652        // previous time window
653        let mut prev_tw = None;
654        for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
655            // filter out expired time window
656            if let Some(expire_lower_bound) = expire_lower_bound
657                && lower_bound < expire_lower_bound
658            {
659                continue;
660            }
661
662            let Some(prev_tw) = &mut prev_tw else {
663                prev_tw = Some((lower_bound, upper_bound));
664                continue;
665            };
666
667            // if cur.lower - prev.upper <= window_size * MERGE_DIST, merge
668            // this also deal with overlap windows because cur.lower > prev.lower is always true
669            let prev_upper = prev_tw
670                .1
671                .unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
672            prev_tw.1 = Some(prev_upper);
673
674            let cur_upper = upper_bound.unwrap_or(
675                lower_bound
676                    .add_duration(std_window_size)
677                    .context(TimeSnafu)?,
678            );
679
680            if lower_bound
681                .sub(&prev_upper)
682                .map(|dist| dist <= window_size * self.time_window_merge_threshold as i32)
683                .unwrap_or(false)
684            {
685                prev_tw.1 = Some(cur_upper);
686            } else {
687                new_windows.insert(prev_tw.0, prev_tw.1);
688                *prev_tw = (lower_bound, Some(cur_upper));
689            }
690        }
691
692        if let Some(prev_tw) = prev_tw {
693            new_windows.insert(prev_tw.0, prev_tw.1);
694        }
695
696        self.windows = new_windows;
697
698        Ok(())
699    }
700}
701
702pub(crate) fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
703    let value = Value::from(value);
704    let value = value
705        .try_to_scalar_value(&value.data_type())
706        .with_context(|_| DatatypesSnafu {
707            extra: format!("Failed to convert to scalar value: {}", value),
708        })?;
709    Ok(value)
710}
711
/// Execution state of a task.
///
/// NOTE(review): in the visible part of this file the state is only ever set
/// to `Idle`; confirm where `Executing` is set.
#[derive(Debug, Clone)]
enum ExecState {
    /// No query is running; the initial state, restored after each query.
    Idle,
    /// Presumably a query is currently running — verify against the task code.
    Executing,
}
717
/// Checkpoint mode of a task.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckpointMode {
    /// The initial mode of a task; also the mode a task falls back to when
    /// incremental mode is disabled.
    FullSnapshot,
    /// Entered once checkpoints have been advanced, unless incremental mode
    /// is disabled.
    Incremental,
}
723
/// Filter expression's information, as produced by `DirtyTimeWindows::gen_filter_exprs`.
#[derive(Debug, Clone)]
pub struct FilterExprInfo {
    /// The filter itself: the `OR` of one `col >= start AND col < end`
    /// predicate per entry of `time_ranges`.
    pub expr: datafusion_expr::Expr,
    /// Name of the column the filter is applied to.
    pub col_name: String,
    /// The `(start, end)` time ranges covered by `expr`; `end` is exclusive.
    pub time_ranges: Vec<(Timestamp, Timestamp)>,
    /// The time window size the filter was generated with.
    pub window_size: chrono::Duration,
}
732
733impl FilterExprInfo {
734    pub fn total_window_length(&self) -> chrono::Duration {
735        self.time_ranges
736            .iter()
737            .fold(chrono::Duration::zero(), |acc, (start, end)| {
738                acc + end.sub(start).unwrap_or(chrono::Duration::zero())
739            })
740    }
741
742    pub fn predicate_for_col(
743        &self,
744        col_name: &str,
745    ) -> Result<Option<datafusion_expr::Expr>, Error> {
746        use datafusion_common::Column;
747        use datafusion_expr::{Expr, lit};
748
749        let mut expr_lst = Vec::with_capacity(self.time_ranges.len());
750        for (start, end) in &self.time_ranges {
751            let lower = to_df_literal(*start)?;
752            let upper = to_df_literal(*end)?;
753            let filter_col = || Expr::Column(Column::new_unqualified(col_name));
754            expr_lst.push(
755                filter_col()
756                    .gt_eq(lit(lower))
757                    .and(filter_col().lt(lit(upper))),
758            );
759        }
760
761        Ok(expr_lst.into_iter().reduce(|a, b| a.or(b)))
762    }
763}
764
765#[cfg(test)]
766mod test {
767    use pretty_assertions::assert_eq;
768    use session::context::QueryContext;
769
770    use super::*;
771    use crate::batching_mode::time_window::find_time_window_expr;
772    use crate::batching_mode::utils::sql_to_df_plan;
773    use crate::test_utils::create_test_query_engine;
774
775    #[test]
776    fn test_task_state_records_last_execution_time() {
777        let query_ctx = QueryContext::arc();
778        let (_tx, rx) = tokio::sync::oneshot::channel();
779        let mut state = TaskState::new(query_ctx, rx);
780
781        assert_eq!(None, state.last_execution_time_millis());
782        state.after_query_exec(std::time::Duration::from_millis(1), false);
783        assert_eq!(None, state.last_execution_time_millis());
784
785        state.after_query_exec(std::time::Duration::from_millis(1), true);
786        assert!(state.last_execution_time_millis().is_some());
787    }
788
789    #[test]
790    fn test_merge_dirty_time_windows() {
791        let merge_dist = DirtyTimeWindows::default().time_window_merge_threshold;
792        let testcases = vec![
793            // just enough to merge
794            (
795                vec![
796                    Timestamp::new_second(0),
797                    Timestamp::new_second((1 + merge_dist as i64) * 5 * 60),
798                ],
799                (chrono::Duration::seconds(5 * 60), None),
800                BTreeMap::from([(
801                    Timestamp::new_second(0),
802                    Some(Timestamp::new_second((2 + merge_dist as i64) * 5 * 60)),
803                )]),
804                Some(
805                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:25:00' AS TIMESTAMP)))",
806                ),
807            ),
808            // separate time window
809            (
810                vec![
811                    Timestamp::new_second(0),
812                    Timestamp::new_second((2 + merge_dist as i64) * 5 * 60),
813                ],
814                (chrono::Duration::seconds(5 * 60), None),
815                BTreeMap::from([
816                    (
817                        Timestamp::new_second(0),
818                        Some(Timestamp::new_second(5 * 60)),
819                    ),
820                    (
821                        Timestamp::new_second((2 + merge_dist as i64) * 5 * 60),
822                        Some(Timestamp::new_second((3 + merge_dist as i64) * 5 * 60)),
823                    ),
824                ]),
825                Some(
826                    "(((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:05:00' AS TIMESTAMP))) OR ((ts >= CAST('1970-01-01 00:25:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:30:00' AS TIMESTAMP))))",
827                ),
828            ),
829            // overlapping
830            (
831                vec![
832                    Timestamp::new_second(0),
833                    Timestamp::new_second((merge_dist as i64) * 5 * 60),
834                ],
835                (chrono::Duration::seconds(5 * 60), None),
836                BTreeMap::from([(
837                    Timestamp::new_second(0),
838                    Some(Timestamp::new_second((1 + merge_dist as i64) * 5 * 60)),
839                )]),
840                Some(
841                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:20:00' AS TIMESTAMP)))",
842                ),
843            ),
844            // complex overlapping
845            (
846                vec![
847                    Timestamp::new_second(0),
848                    Timestamp::new_second((merge_dist as i64) * 3),
849                    Timestamp::new_second((merge_dist as i64) * 3 * 2),
850                ],
851                (chrono::Duration::seconds(3), None),
852                BTreeMap::from([(
853                    Timestamp::new_second(0),
854                    Some(Timestamp::new_second((merge_dist as i64) * 7)),
855                )]),
856                Some(
857                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:00:21' AS TIMESTAMP)))",
858                ),
859            ),
860            // split range
861            (
862                Vec::from_iter((0..20).map(|i| Timestamp::new_second(i * 3)).chain(
863                    std::iter::once(Timestamp::new_second(
864                        60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1),
865                    )),
866                )),
867                (chrono::Duration::seconds(3), None),
868                BTreeMap::from([
869                    (Timestamp::new_second(0), Some(Timestamp::new_second(60))),
870                    (
871                        Timestamp::new_second(60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1)),
872                        Some(Timestamp::new_second(
873                            60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1) + 3,
874                        )),
875                    ),
876                ]),
877                Some(
878                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:00' AS TIMESTAMP)))",
879                ),
880            ),
881            // split 2 min into 1 min
882            (
883                Vec::from_iter((0..40).map(|i| Timestamp::new_second(i * 3))),
884                (chrono::Duration::seconds(3), None),
885                BTreeMap::from([(
886                    Timestamp::new_second(0),
887                    Some(Timestamp::new_second(40 * 3)),
888                )]),
889                Some(
890                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:00' AS TIMESTAMP)))",
891                ),
892            ),
893            // split 3s + 1min into 3s + 57s
894            (
895                Vec::from_iter(
896                    std::iter::once(Timestamp::new_second(0))
897                        .chain((0..40).map(|i| Timestamp::new_second(20 + i * 3))),
898                ),
899                (chrono::Duration::seconds(3), None),
900                BTreeMap::from([
901                    (Timestamp::new_second(0), Some(Timestamp::new_second(3))),
902                    (Timestamp::new_second(20), Some(Timestamp::new_second(140))),
903                ]),
904                Some(
905                    "(((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:00:03' AS TIMESTAMP))) OR ((ts >= CAST('1970-01-01 00:00:20' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:17' AS TIMESTAMP))))",
906                ),
907            ),
908            // expired
909            (
910                vec![
911                    Timestamp::new_second(0),
912                    Timestamp::new_second((merge_dist as i64) * 5 * 60),
913                ],
914                (
915                    chrono::Duration::seconds(5 * 60),
916                    Some(Timestamp::new_second((merge_dist as i64) * 6 * 60)),
917                ),
918                BTreeMap::from([]),
919                None,
920            ),
921        ];
922        // let len = testcases.len();
923        // let testcases = testcases[(len - 2)..(len - 1)].to_vec();
924        for (lower_bounds, (window_size, expire_lower_bound), expected, expected_filter_expr) in
925            testcases
926        {
927            let mut dirty = DirtyTimeWindows::default();
928            dirty.add_lower_bounds(lower_bounds.into_iter());
929            dirty
930                .merge_dirty_time_windows(window_size, expire_lower_bound)
931                .unwrap();
932            assert_eq!(expected, dirty.windows);
933            let filter_expr = dirty
934                .gen_filter_exprs(
935                    "ts",
936                    expire_lower_bound,
937                    window_size,
938                    dirty.max_filter_num_per_query,
939                    0,
940                    None,
941                )
942                .unwrap()
943                .map(|e| e.expr);
944
945            let unparser = datafusion::sql::unparser::Unparser::default();
946            let to_sql = filter_expr
947                .as_ref()
948                .map(|e| unparser.expr_to_sql(e).unwrap().to_string());
949            assert_eq!(expected_filter_expr, to_sql.as_deref());
950        }
951    }
952
953    #[tokio::test]
954    async fn test_align_time_window() {
955        type TimeWindow = (Timestamp, Option<Timestamp>);
956        struct TestCase {
957            sql: String,
958            aligns: Vec<(TimeWindow, TimeWindow)>,
959        }
960        let testcases: Vec<TestCase> = vec![TestCase{
961            sql: "SELECT date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window;".to_string(),
962            aligns: vec![
963                ((Timestamp::new_second(3), None), (Timestamp::new_second(0), None)),
964                ((Timestamp::new_second(8), None), (Timestamp::new_second(5), None)),
965                ((Timestamp::new_second(8), Some(Timestamp::new_second(10))), (Timestamp::new_second(5), Some(Timestamp::new_second(10)))),
966                ((Timestamp::new_second(8), Some(Timestamp::new_second(9))), (Timestamp::new_second(5), Some(Timestamp::new_second(10)))),
967            ],
968        }];
969
970        let query_engine = create_test_query_engine();
971        let ctx = QueryContext::arc();
972        for TestCase { sql, aligns } in testcases {
973            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &sql, true)
974                .await
975                .unwrap();
976
977            let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
978                &plan,
979                query_engine.engine_state().catalog_manager().clone(),
980                ctx.clone(),
981            )
982            .await
983            .unwrap();
984
985            let time_window_expr = time_window_expr
986                .map(|expr| {
987                    TimeWindowExpr::from_expr(
988                        &expr,
989                        &column_name,
990                        &df_schema,
991                        &query_engine.engine_state().session_state(),
992                    )
993                })
994                .transpose()
995                .unwrap()
996                .unwrap();
997
998            let dirty = DirtyTimeWindows::default();
999            for (before_align, expected_after_align) in aligns {
1000                let after_align = dirty
1001                    .align_time_window(before_align.0, before_align.1, &time_window_expr)
1002                    .unwrap();
1003                assert_eq!(expected_after_align, after_align);
1004            }
1005        }
1006    }
1007
1008    #[test]
1009    fn test_task_state_checkpoint_mode_and_advancement() {
1010        let query_ctx = QueryContext::arc();
1011        let (_tx, rx) = tokio::sync::oneshot::channel();
1012        let mut state = TaskState::new(query_ctx, rx);
1013
1014        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1015        assert!(state.checkpoints().is_empty());
1016
1017        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1018        assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
1019        assert_eq!(
1020            state.checkpoints(),
1021            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1022        );
1023
1024        state.mark_full_snapshot();
1025        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1026        assert_eq!(
1027            state.checkpoints(),
1028            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1029        );
1030    }
1031
1032    #[test]
1033    fn test_disable_incremental_persists_full_snapshot_mode() {
1034        let query_ctx = QueryContext::arc();
1035        let (_tx, rx) = tokio::sync::oneshot::channel();
1036        let mut state = TaskState::new(query_ctx, rx);
1037
1038        assert!(!state.is_incremental_disabled());
1039
1040        // After disable, mode becomes FullSnapshot and flag is set.
1041        state.disable_incremental();
1042        assert!(state.is_incremental_disabled());
1043        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1044
1045        // `advance_checkpoints` will NOT transition to Incremental when disabled.
1046        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1047        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1048        assert_eq!(
1049            state.checkpoints(),
1050            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1051        );
1052
1053        // `mark_full_snapshot` does not re-enable incremental.
1054        state.mark_full_snapshot();
1055        assert!(state.is_incremental_disabled());
1056        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1057    }
1058
1059    #[test]
1060    fn test_full_snapshot_checkpoint_advancement_requires_participating_regions() {
1061        let query_ctx = QueryContext::arc();
1062        let (_tx, rx) = tokio::sync::oneshot::channel();
1063        let state = TaskState::new(query_ctx, rx);
1064
1065        assert!(!state.can_advance_full_snapshot_checkpoints(&BTreeSet::new(), &HashMap::new()));
1066        assert!(!state.can_advance_full_snapshot_checkpoints(
1067            &BTreeSet::from([1_u64, 2_u64]),
1068            &HashMap::from([(1_u64, 10_u64)]),
1069        ));
1070        assert!(state.can_advance_full_snapshot_checkpoints(
1071            &BTreeSet::from([1_u64, 2_u64]),
1072            &HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]),
1073        ));
1074    }
1075
1076    #[test]
1077    fn test_incremental_checkpoint_advancement_requires_participation_alignment() {
1078        let query_ctx = QueryContext::arc();
1079        let (_tx, rx) = tokio::sync::oneshot::channel();
1080        let mut state = TaskState::new(query_ctx, rx);
1081        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1082
1083        assert!(
1084            state.can_advance_incremental_checkpoints_with_participation(
1085                &BTreeSet::from([1_u64]),
1086                &HashMap::from([(1_u64, 11_u64)]),
1087            )
1088        );
1089        assert!(
1090            !state.can_advance_incremental_checkpoints_with_participation(
1091                &BTreeSet::from([1_u64, 2_u64]),
1092                &HashMap::from([(1_u64, 11_u64)]),
1093            )
1094        );
1095        assert!(
1096            !state.can_advance_incremental_checkpoints_with_participation(
1097                &BTreeSet::from([3_u64]),
1098                &HashMap::from([(3_u64, 11_u64)]),
1099            )
1100        );
1101        assert!(
1102            !state.can_advance_incremental_checkpoints_with_participation(
1103                &BTreeSet::from([1_u64]),
1104                &HashMap::from([(1_u64, 9_u64)]),
1105            )
1106        );
1107        assert!(
1108            state.can_advance_incremental_checkpoints_with_participation(
1109                &BTreeSet::from([1_u64, 2_u64]),
1110                &HashMap::from([(1_u64, 11_u64), (2_u64, 21_u64)]),
1111            )
1112        );
1113
1114        state.disable_incremental();
1115        assert!(
1116            !state.can_advance_incremental_checkpoints_with_participation(
1117                &BTreeSet::from([1_u64, 2_u64]),
1118                &HashMap::from([(1_u64, 12_u64), (2_u64, 22_u64)]),
1119            )
1120        );
1121    }
1122
1123    #[test]
1124    fn test_incremental_checkpoint_advancement_merges_participating_subset() {
1125        let query_ctx = QueryContext::arc();
1126        let (_tx, rx) = tokio::sync::oneshot::channel();
1127        let mut state = TaskState::new(query_ctx, rx);
1128        state.advance_checkpoints(HashMap::from([
1129            (1_u64, 10_u64),
1130            (2_u64, 20_u64),
1131            (3_u64, 30_u64),
1132        ]));
1133
1134        state.advance_incremental_checkpoints_with_participation(
1135            &BTreeSet::from([1_u64, 3_u64]),
1136            HashMap::from([(1_u64, 12_u64), (3_u64, 35_u64)]),
1137        );
1138
1139        assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
1140        assert_eq!(
1141            state.checkpoints(),
1142            &BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)])
1143        );
1144    }
1145
1146    #[test]
1147    fn test_filter_expr_info_predicate_for_col_empty_ranges() {
1148        let filter = FilterExprInfo {
1149            expr: datafusion_expr::col("ts"),
1150            col_name: "ts".to_string(),
1151            time_ranges: vec![],
1152            window_size: chrono::Duration::seconds(1),
1153        };
1154
1155        assert!(filter.predicate_for_col("time_window").unwrap().is_none());
1156    }
1157
1158    #[test]
1159    fn test_filter_expr_info_predicate_for_col_single_range() {
1160        let filter = FilterExprInfo {
1161            expr: datafusion_expr::col("ts"),
1162            col_name: "ts".to_string(),
1163            time_ranges: vec![(Timestamp::new_second(0), Timestamp::new_second(1))],
1164            window_size: chrono::Duration::seconds(1),
1165        };
1166
1167        let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
1168        let unparser = datafusion::sql::unparser::Unparser::default();
1169        assert_eq!(
1170            "((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP)))",
1171            unparser.expr_to_sql(&predicate).unwrap().to_string()
1172        );
1173    }
1174
1175    #[test]
1176    fn test_filter_expr_info_predicate_for_col_multiple_ranges() {
1177        let filter = FilterExprInfo {
1178            expr: datafusion_expr::col("ts"),
1179            col_name: "ts".to_string(),
1180            time_ranges: vec![
1181                (Timestamp::new_second(0), Timestamp::new_second(1)),
1182                (Timestamp::new_second(10), Timestamp::new_second(11)),
1183            ],
1184            window_size: chrono::Duration::seconds(1),
1185        };
1186
1187        let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
1188        let unparser = datafusion::sql::unparser::Unparser::default();
1189        assert_eq!(
1190            "(((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP))) OR ((time_window >= CAST('1970-01-01 00:00:10' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:11' AS TIMESTAMP))))",
1191            unparser.expr_to_sql(&predicate).unwrap().to_string()
1192        );
1193    }
1194
1195    /// Helper: create a `TaskState` whose `last_update_time` is a known duration in the past.
1196    fn state_with_past_update(age: Duration) -> TaskState {
1197        let query_ctx = QueryContext::arc();
1198        let (_tx, rx) = tokio::sync::oneshot::channel();
1199        let mut state = TaskState::new(query_ctx, rx);
1200        state.last_update_time = Instant::now() - age;
1201        state
1202    }
1203
1204    #[test]
1205    fn test_short_incremental_cadence_uses_min_refresh() {
1206        // When prefer_short_incremental_cadence is true and dirty backlog is manageable,
1207        // the next start time should be last_update_time + min_refresh (short cadence),
1208        // ignoring the longer time_window_size.
1209        let state = state_with_past_update(Duration::from_secs(10));
1210
1211        let time_window_size = Some(Duration::from_secs(60)); // large window
1212        let min_refresh = Duration::from_secs(5);
1213        let flow_id = 1;
1214
1215        let result = state.get_next_start_query_time(
1216            flow_id,
1217            &time_window_size,
1218            min_refresh,
1219            None,
1220            20,
1221            true, // prefer_short_incremental_cadence
1222        );
1223
1224        // With short cadence, result should be last_update_time + min_refresh.
1225        let expected = state.last_update_time + min_refresh;
1226        assert_eq!(result, expected);
1227    }
1228
1229    #[test]
1230    fn test_short_incremental_cadence_respects_last_query_duration() {
1231        let mut state = state_with_past_update(Duration::from_secs(10));
1232        state.last_query_duration = Duration::from_secs(20);
1233
1234        let time_window_size = Some(Duration::from_secs(60));
1235        let min_refresh = Duration::from_secs(5);
1236        let flow_id = 1;
1237
1238        let result = state.get_next_start_query_time(
1239            flow_id,
1240            &time_window_size,
1241            min_refresh,
1242            None,
1243            20,
1244            true,
1245        );
1246
1247        assert_eq!(result, state.last_update_time + state.last_query_duration);
1248    }
1249
1250    #[test]
1251    fn test_short_incremental_cadence_respects_max_timeout() {
1252        let mut state = state_with_past_update(Duration::from_secs(10));
1253        state.last_query_duration = Duration::from_secs(20);
1254
1255        let time_window_size = Some(Duration::from_secs(60));
1256        let min_refresh = Duration::from_secs(30);
1257        let max_timeout = Duration::from_secs(5);
1258        let flow_id = 1;
1259
1260        let result = state.get_next_start_query_time(
1261            flow_id,
1262            &time_window_size,
1263            min_refresh,
1264            Some(max_timeout),
1265            20,
1266            true,
1267        );
1268
1269        assert_eq!(result, state.last_update_time + max_timeout);
1270    }
1271
1272    #[test]
1273    fn test_full_snapshot_ignores_short_cadence() {
1274        // When prefer_short_incremental_cadence is false (full snapshot mode),
1275        // the normal long-cadence based on time_window_size applies.
1276        let mut state = state_with_past_update(Duration::from_secs(10));
1277        // Make last_query_duration small so the lower bound (time_window_size) dominates.
1278        state.last_query_duration = Duration::from_secs(1);
1279
1280        let time_window_size = Some(Duration::from_secs(60)); // large window
1281        let min_refresh = Duration::from_secs(5);
1282        let flow_id = 1;
1283
1284        let result = state.get_next_start_query_time(
1285            flow_id,
1286            &time_window_size,
1287            min_refresh,
1288            None,
1289            20,
1290            false, // prefer_short_incremental_cadence = false
1291        );
1292
1293        // With normal cadence, result should be last_update_time + time_window_size
1294        // (since last_query_duration < time_window_size).
1295        let expected = state.last_update_time + Duration::from_secs(60);
1296        assert_eq!(result, expected);
1297    }
1298
1299    #[test]
1300    fn test_dirty_window_overflow_schedules_immediately_even_with_short_cadence() {
1301        // Dirty-window overflow must always schedule immediately,
1302        // regardless of prefer_short_incremental_cadence.
1303        let mut state = state_with_past_update(Duration::from_secs(10));
1304        // Create a very large dirty backlog.
1305        state
1306            .dirty_time_windows
1307            .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(3600)));
1308
1309        let time_window_size = Some(Duration::from_secs(1)); // tiny window => overflow
1310        let min_refresh = Duration::from_secs(5);
1311        let flow_id = 1;
1312
1313        // With short cadence flag.
1314        let result = state.get_next_start_query_time(
1315            flow_id,
1316            &time_window_size,
1317            min_refresh,
1318            None,
1319            1, // max 1 filter => tiny capacity
1320            true,
1321        );
1322        assert!(
1323            result <= Instant::now(),
1324            "dirty overflow should schedule immediately"
1325        );
1326
1327        // Without short cadence flag — same behavior.
1328        let result2 = state.get_next_start_query_time(
1329            flow_id,
1330            &time_window_size,
1331            min_refresh,
1332            None,
1333            1,
1334            false,
1335        );
1336        assert!(
1337            result2 <= Instant::now(),
1338            "dirty overflow should schedule immediately"
1339        );
1340    }
1341
1342    #[test]
1343    fn test_incremental_disabled_ignores_short_cadence() {
1344        // When prefer_short_incremental_cadence is true but the dirty backlog is
1345        // manageable, the short cadence is applied. This test verifies that the
1346        // caller-side guard (checkpoint_mode + !is_incremental_disabled) controls
1347        // whether short cadence is requested at all — when incremental is disabled,
1348        // the flag is false, and the long cadence applies.
1349        //
1350        // This simulates the case where the caller computed
1351        // prefer_short_incremental_cadence = false (e.g. incremental disabled
1352        // or FullSnapshot mode), so the long cadence is used.
1353        let mut state = state_with_past_update(Duration::from_secs(10));
1354        state.last_query_duration = Duration::from_secs(1);
1355
1356        let time_window_size = Some(Duration::from_secs(60));
1357        let min_refresh = Duration::from_secs(5);
1358        let flow_id = 1;
1359
1360        let result = state.get_next_start_query_time(
1361            flow_id,
1362            &time_window_size,
1363            min_refresh,
1364            None,
1365            20,
1366            false, // prefer_short_incremental_cadence = false
1367        );
1368
1369        // With normal cadence, result should be last_update_time + time_window_size.
1370        let expected = state.last_update_time + Duration::from_secs(60);
1371        assert_eq!(result, expected);
1372    }
1373}