1use std::collections::{BTreeMap, BTreeSet, HashMap};
19use std::time::Duration;
20
21use common_telemetry::debug;
22use common_telemetry::tracing::warn;
23use common_time::Timestamp;
24use datatypes::value::Value;
25use session::context::QueryContextRef;
26use snafu::{OptionExt, ResultExt, ensure};
27use tokio::sync::oneshot;
28use tokio::time::Instant;
29
30use crate::batching_mode::task::BatchingTask;
31use crate::batching_mode::time_window::TimeWindowExpr;
32use crate::error::{DatatypesSnafu, InternalSnafu, TimeSnafu, UnexpectedSnafu};
33use crate::metrics::{
34 METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_SIZE,
35 METRIC_FLOW_BATCHING_ENGINE_STALLED_WINDOW_SIZE,
36};
37use crate::{Error, FlowId};
38
/// Mutable runtime state tracked for one batching-mode flow task.
///
/// Holds timing information about the last query, the set of dirty time
/// windows still to be processed, and checkpoint bookkeeping.
#[derive(Debug)]
pub struct TaskState {
    /// Query context supplied at construction time.
    pub(crate) query_ctx: QueryContextRef,
    /// Instant of construction, refreshed on every `after_query_exec` call.
    last_update_time: Instant,
    /// Elapsed time reported for the most recent query execution.
    last_query_duration: Duration,
    /// Wall-clock milliseconds recorded after the last successful
    /// execution; stays `None` until one execution succeeds.
    last_exec_time_millis: Option<i64>,
    /// Time windows that still need to be (re)queried.
    pub(crate) dirty_time_windows: DirtyTimeWindows,
    /// Whether the next run is treated as a full snapshot or an incremental run.
    checkpoint_mode: CheckpointMode,
    /// Checkpoint values keyed by id. The methods of this type name the key
    /// `region_id` and the value `seq`.
    checkpoints: BTreeMap<u64, u64>,
    /// Once set by `disable_incremental`, the mode is never switched to
    /// `Incremental` by the advance methods.
    incremental_disabled: bool,
    /// Execution state; the code visible in this file only ever stores `Idle`.
    exec_state: ExecState,
    /// Receiver half of a oneshot channel.
    // NOTE(review): presumably used to signal task shutdown — confirm with the sender's owner.
    pub(crate) shutdown_rx: oneshot::Receiver<()>,
    /// Join handle of a spawned tokio task, if any; initialised to `None`.
    pub(crate) task_handle: Option<tokio::task::JoinHandle<()>>,
}
67impl TaskState {
/// Creates a task state that starts with a default [`DirtyTimeWindows`].
///
/// Delegates to [`Self::with_dirty_time_windows`].
pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
    Self::with_dirty_time_windows(query_ctx, shutdown_rx, DirtyTimeWindows::default())
}
71
72 pub fn with_dirty_time_windows(
73 query_ctx: QueryContextRef,
74 shutdown_rx: oneshot::Receiver<()>,
75 dirty_time_windows: DirtyTimeWindows,
76 ) -> Self {
77 Self {
78 query_ctx,
79 last_update_time: Instant::now(),
80 last_query_duration: Duration::from_secs(0),
81 last_exec_time_millis: None,
82 dirty_time_windows,
83 checkpoint_mode: CheckpointMode::FullSnapshot,
84 checkpoints: Default::default(),
85 incremental_disabled: false,
86 exec_state: ExecState::Idle,
87 shutdown_rx,
88 task_handle: None,
89 }
90 }
91
92 pub fn after_query_exec(&mut self, elapsed: Duration, is_succ: bool) {
95 self.exec_state = ExecState::Idle;
96 self.last_query_duration = elapsed;
97 self.last_update_time = Instant::now();
98 if is_succ {
99 self.last_exec_time_millis = Some(common_time::util::current_time_millis());
100 }
101 }
102
/// Returns the millisecond timestamp stored by the last successful
/// `after_query_exec` call, or `None` if no execution has succeeded yet.
pub fn last_execution_time_millis(&self) -> Option<i64> {
    self.last_exec_time_millis
}
106
/// Returns the current checkpoint mode.
pub fn checkpoint_mode(&self) -> CheckpointMode {
    self.checkpoint_mode
}
110
/// Returns the stored checkpoints, ordered by key.
pub fn checkpoints(&self) -> &BTreeMap<u64, u64> {
    &self.checkpoints
}
114
/// Returns `true` once `disable_incremental` has been called.
pub fn is_incremental_disabled(&self) -> bool {
    self.incremental_disabled
}
118
119 pub fn disable_incremental(&mut self) {
122 self.incremental_disabled = true;
123 self.mark_full_snapshot();
124 }
125
/// Switches the checkpoint mode to [`CheckpointMode::FullSnapshot`].
///
/// Stored checkpoints are left untouched.
pub fn mark_full_snapshot(&mut self) {
    self.checkpoint_mode = CheckpointMode::FullSnapshot;
}
129
130 pub fn advance_checkpoints(&mut self, watermark_map: HashMap<u64, u64>) {
131 self.checkpoints = watermark_map.into_iter().collect();
132 if !self.incremental_disabled {
133 self.checkpoint_mode = CheckpointMode::Incremental;
134 }
135 }
136
137 pub fn advance_incremental_checkpoints_with_participation(
138 &mut self,
139 participating_regions: &BTreeSet<u64>,
140 watermark_map: HashMap<u64, u64>,
141 ) {
142 for region_id in participating_regions {
143 if let Some(seq) = watermark_map.get(region_id) {
144 self.checkpoints.insert(*region_id, *seq);
145 }
146 }
147 if !self.incremental_disabled {
148 self.checkpoint_mode = CheckpointMode::Incremental;
149 }
150 }
151
152 pub fn can_advance_full_snapshot_checkpoints(
153 &self,
154 participating_regions: &BTreeSet<u64>,
155 watermark_map: &HashMap<u64, u64>,
156 ) -> bool {
157 !participating_regions.is_empty()
158 && participating_regions.len() == watermark_map.len()
159 && participating_regions
160 .iter()
161 .all(|region_id| watermark_map.contains_key(region_id))
162 }
163
164 pub fn can_advance_incremental_checkpoints_with_participation(
165 &self,
166 participating_regions: &BTreeSet<u64>,
167 watermark_map: &HashMap<u64, u64>,
168 ) -> bool {
169 !self.incremental_disabled
170 && !self.checkpoints.is_empty()
171 && !participating_regions.is_empty()
172 && participating_regions.len() == watermark_map.len()
173 && participating_regions
174 .iter()
175 .all(|region_id| self.checkpoints.contains_key(region_id))
176 && participating_regions.iter().all(|region_id| {
177 let checkpoint = self.checkpoints.get(region_id);
178 watermark_map
179 .get(region_id)
180 .zip(checkpoint)
181 .is_some_and(|(seq, checkpoint)| seq >= checkpoint)
182 })
183 }
184
185 pub fn get_next_start_query_time(
200 &self,
201 flow_id: FlowId,
202 time_window_size: &Option<Duration>,
203 min_refresh_duration: Duration,
204 max_timeout: Option<Duration>,
205 max_filter_num_per_query: usize,
206 prefer_short_incremental_cadence: bool,
207 ) -> Instant {
208 let lower = time_window_size.unwrap_or(min_refresh_duration);
210 let next_duration = self.last_query_duration.max(lower);
211 let next_duration = if let Some(max_timeout) = max_timeout {
212 next_duration.min(max_timeout)
213 } else {
214 next_duration
215 };
216
217 let cur_dirty_window_size = self.dirty_time_windows.window_size();
218 let max_query_update_range = (*time_window_size)
220 .unwrap_or_default()
221 .mul_f64(max_filter_num_per_query as f64);
222 if cur_dirty_window_size < max_query_update_range {
225 if prefer_short_incremental_cadence {
226 let next_duration = self.last_query_duration.max(min_refresh_duration);
230 let next_duration = if let Some(max_timeout) = max_timeout {
231 next_duration.min(max_timeout)
232 } else {
233 next_duration
234 };
235 self.last_update_time + next_duration
236 } else {
237 self.last_update_time + next_duration
238 }
239 } else {
240 debug!(
243 "Flow id = {}, still have too many {} dirty time window({:?}), execute immediately",
244 flow_id,
245 self.dirty_time_windows.windows.len(),
246 self.dirty_time_windows.windows
247 );
248 Instant::now()
249 }
250 }
251}
252
/// Set of time windows whose data still has to be (re)queried.
#[derive(Debug, Clone)]
pub struct DirtyTimeWindows {
    /// Windows keyed by their lower bound. A value of `None` means the
    /// upper bound is not known; the methods of this type then substitute
    /// `lower bound + window size`.
    windows: BTreeMap<Timestamp, Option<Timestamp>>,
    /// Configured maximum number of filters per query. Within this file it
    /// is only read by the test-only getter and the tests.
    max_filter_num_per_query: usize,
    /// Two neighbouring windows are merged when their gap is at most this
    /// many window sizes (see `merge_dirty_time_windows`).
    time_window_merge_threshold: usize,
}
266
impl DirtyTimeWindows {
    /// Creates an empty set of dirty windows with the given limits.
    pub fn new(max_filter_num_per_query: usize, time_window_merge_threshold: usize) -> Self {
        Self {
            windows: BTreeMap::new(),
            max_filter_num_per_query,
            time_window_merge_threshold,
        }
    }

    /// Test-only accessor for the configured per-query filter limit.
    #[cfg(test)]
    pub(crate) fn max_filter_num_per_query(&self) -> usize {
        self.max_filter_num_per_query
    }

    /// Test-only accessor for the configured merge threshold.
    #[cfg(test)]
    pub(crate) fn time_window_merge_threshold(&self) -> usize {
        self.time_window_merge_threshold
    }
}
286
287impl Default for DirtyTimeWindows {
288 fn default() -> Self {
289 Self {
290 windows: BTreeMap::new(),
291 max_filter_num_per_query: 20,
292 time_window_merge_threshold: 3,
293 }
294 }
295}
296
297impl DirtyTimeWindows {
/// Merge distance constant, in window sizes.
// NOTE(review): the merge logic visible in this file reads
// `time_window_merge_threshold` instead; within this file the constant is
// only referenced from tests — confirm whether other modules still need it.
pub const MERGE_DIST: i32 = 3;
302
303 pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
309 for lower_bound in lower_bounds {
310 let entry = self.windows.entry(lower_bound);
311 entry.or_insert(None);
312 }
313 }
314
315 pub fn window_size(&self) -> Duration {
316 let mut ret = Duration::from_secs(0);
317 for (start, end) in &self.windows {
318 if let Some(end) = end
319 && let Some(duration) = end.sub(start)
320 {
321 ret += duration.to_std().unwrap_or_default();
322 }
323 }
324 ret
325 }
326
/// Adds one window, merging its upper bound with an existing window that
/// has the same lower bound (see `add_or_merge_window`).
pub fn add_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
    self.add_or_merge_window(start, end);
}
330
331 pub fn add_windows(&mut self, time_ranges: Vec<(Timestamp, Timestamp)>) {
332 for (start, end) in time_ranges {
333 self.add_or_merge_window(start, Some(end));
334 }
335 }
336
337 pub fn add_dirty_windows(&mut self, dirty_windows: &DirtyTimeWindows) {
339 for (start, end) in &dirty_windows.windows {
340 self.add_or_merge_window(*start, *end);
341 }
342 }
343
344 fn add_or_merge_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
345 self.windows
346 .entry(start)
347 .and_modify(|current_end| {
348 *current_end = Self::union_window_end(*current_end, end);
349 })
350 .or_insert(end);
351 }
352
353 fn union_window_end(
354 current_end: Option<Timestamp>,
355 incoming_end: Option<Timestamp>,
356 ) -> Option<Timestamp> {
357 match (current_end, incoming_end) {
358 (Some(current), Some(incoming)) => Some(current.max(incoming)),
359 (Some(end), None) | (None, Some(end)) => Some(end),
363 (None, None) => None,
364 }
365 }
366
/// Removes all dirty windows.
pub fn clean(&mut self) {
    self.windows.clear();
}
371
/// Ensures the set is non-empty by adding a window that starts at second
/// 0 with an unknown upper bound (an existing window at that lower bound
/// keeps its upper bound).
// NOTE(review): presumably used as a generic "something is dirty" marker — confirm with callers.
pub fn set_dirty(&mut self) {
    self.add_or_merge_window(Timestamp::new_second(0), None);
}
377
/// Number of stored windows (not their accumulated length).
pub fn len(&self) -> usize {
    self.windows.len()
}
382
/// Returns `true` when no window is stored.
pub fn is_empty(&self) -> bool {
    self.windows.is_empty()
}
386
387 pub fn effective_count(&self, window_size: &Duration) -> usize {
390 if self.windows.is_empty() {
391 return 0;
392 }
393 let window_size =
394 chrono::Duration::from_std(*window_size).unwrap_or(chrono::Duration::zero());
395 let total_window_time_range =
396 self.windows
397 .iter()
398 .fold(chrono::Duration::zero(), |acc, (start, end)| {
399 if let Some(end) = end {
400 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
401 } else {
402 acc + window_size
403 }
404 });
405
406 if window_size.num_seconds() == 0 {
408 0
409 } else {
410 (total_window_time_range.num_seconds() / window_size.num_seconds()) as usize
411 }
412 }
413
414 pub fn gen_filter_exprs(
420 &mut self,
421 col_name: &str,
422 expire_lower_bound: Option<Timestamp>,
423 window_size: chrono::Duration,
424 window_cnt: usize,
425 flow_id: FlowId,
426 task_ctx: Option<&BatchingTask>,
427 ) -> Result<Option<FilterExprInfo>, Error> {
428 ensure!(
429 window_size.num_seconds() > 0,
430 UnexpectedSnafu {
431 reason: "window_size is zero, can't generate filter exprs",
432 }
433 );
434
435 debug!(
436 "expire_lower_bound: {:?}, window_size: {:?}",
437 expire_lower_bound.map(|t| t.to_iso8601_string()),
438 window_size
439 );
440 self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
441
442 if self.windows.len() > window_cnt {
443 let first_time_window = self.windows.first_key_value();
444 let last_time_window = self.windows.last_key_value();
445
446 if let Some(task_ctx) = task_ctx {
447 warn!(
448 "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
449 task_ctx.config.flow_id,
450 self.windows.len(),
451 window_cnt,
452 task_ctx.config.time_window_expr,
453 task_ctx.config.expire_after,
454 first_time_window,
455 last_time_window,
456 task_ctx.config.query
457 );
458 } else {
459 warn!(
460 "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. first_time_window={:?}, last_time_window={:?}",
461 flow_id,
462 self.windows.len(),
463 window_cnt,
464 first_time_window,
465 last_time_window
466 )
467 }
468 }
469
470 let max_time_range = window_size * window_cnt as i32;
472
473 let mut to_be_query = BTreeMap::new();
474 let mut new_windows = self.windows.clone();
475 let mut cur_time_range = chrono::Duration::zero();
476 for (idx, (start, end)) in self.windows.iter().enumerate() {
477 let first_end = start
478 .add_duration(window_size.to_std().unwrap())
479 .context(TimeSnafu)?;
480 let end = end.unwrap_or(first_end);
481
482 if cur_time_range >= max_time_range {
484 break;
485 }
486
487 if idx >= window_cnt {
489 break;
490 }
491
492 let Some(x) = end.sub(start) else {
493 continue;
494 };
495 if cur_time_range + x <= max_time_range {
496 to_be_query.insert(*start, Some(end));
497 new_windows.remove(start);
498 cur_time_range += x;
499 } else {
500 let surplus = max_time_range - cur_time_range;
503 if surplus.num_seconds() <= window_size.num_seconds() {
504 break;
506 }
507 let times = surplus.num_seconds() / window_size.num_seconds();
508
509 let split_offset = window_size * times as i32;
510 let split_at = start
511 .add_duration(split_offset.to_std().unwrap())
512 .context(TimeSnafu)?;
513 to_be_query.insert(*start, Some(split_at));
514
515 new_windows.remove(start);
517 new_windows.insert(split_at, Some(end));
518 cur_time_range += split_offset;
519 break;
520 }
521 }
522
523 self.windows = new_windows;
524
525 METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_CNT
526 .with_label_values(&[flow_id.to_string().as_str()])
527 .observe(to_be_query.len() as f64);
528
529 let full_time_range = to_be_query
530 .iter()
531 .fold(chrono::Duration::zero(), |acc, (start, end)| {
532 if let Some(end) = end {
533 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
534 } else {
535 acc + window_size
536 }
537 })
538 .num_seconds() as f64;
539 METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_SIZE
540 .with_label_values(&[flow_id.to_string().as_str()])
541 .observe(full_time_range);
542
543 let stalled_time_range =
544 self.windows
545 .iter()
546 .fold(chrono::Duration::zero(), |acc, (start, end)| {
547 if let Some(end) = end {
548 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
549 } else {
550 acc + window_size
551 }
552 });
553
554 METRIC_FLOW_BATCHING_ENGINE_STALLED_WINDOW_SIZE
555 .with_label_values(&[flow_id.to_string().as_str()])
556 .observe(stalled_time_range.num_seconds() as f64);
557
558 let std_window_size = window_size.to_std().map_err(|e| {
559 InternalSnafu {
560 reason: e.to_string(),
561 }
562 .build()
563 })?;
564
565 let mut expr_lst = vec![];
566 let mut time_ranges = vec![];
567 for (start, end) in to_be_query.into_iter() {
568 let (start, end) = if let Some(ctx) = task_ctx {
570 let Some(time_window_expr) = &ctx.config.time_window_expr else {
571 UnexpectedSnafu {
572 reason: "time_window_expr is not set",
573 }
574 .fail()?
575 };
576 self.align_time_window(start, end, time_window_expr)?
577 } else {
578 (start, end)
579 };
580 let end = end.unwrap_or(start.add_duration(std_window_size).context(TimeSnafu)?);
581 time_ranges.push((start, end));
582
583 debug!(
584 "Time window start: {:?}, end: {:?}",
585 start.to_iso8601_string(),
586 end.to_iso8601_string()
587 );
588
589 use datafusion_expr::{col, lit};
590 let lower = to_df_literal(start)?;
591 let upper = to_df_literal(end)?;
592 let expr = col(col_name)
593 .gt_eq(lit(lower))
594 .and(col(col_name).lt(lit(upper)));
595 expr_lst.push(expr);
596 }
597 let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
598 let ret = expr.map(|expr| FilterExprInfo {
599 expr,
600 col_name: col_name.to_string(),
601 time_ranges,
602 window_size,
603 });
604 Ok(ret)
605 }
606
607 fn align_time_window(
608 &self,
609 start: Timestamp,
610 end: Option<Timestamp>,
611 time_window_expr: &TimeWindowExpr,
612 ) -> Result<(Timestamp, Option<Timestamp>), Error> {
613 let align_start = time_window_expr.eval(start)?.0.context(UnexpectedSnafu {
614 reason: format!(
615 "Failed to align start time {:?} with time window expr {:?}",
616 start, time_window_expr
617 ),
618 })?;
619 let align_end = end
620 .and_then(|end| {
621 time_window_expr
622 .eval(end)
623 .map(|r| if r.0 == Some(end) { r.0 } else { r.1 })
625 .transpose()
626 })
627 .transpose()?;
628 Ok((align_start, align_end))
629 }
630
631 pub fn merge_dirty_time_windows(
635 &mut self,
636 window_size: chrono::Duration,
637 expire_lower_bound: Option<Timestamp>,
638 ) -> Result<(), Error> {
639 if self.windows.is_empty() {
640 return Ok(());
641 }
642
643 let mut new_windows = BTreeMap::new();
644
645 let std_window_size = window_size.to_std().map_err(|e| {
646 InternalSnafu {
647 reason: e.to_string(),
648 }
649 .build()
650 })?;
651
652 let mut prev_tw = None;
654 for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
655 if let Some(expire_lower_bound) = expire_lower_bound
657 && lower_bound < expire_lower_bound
658 {
659 continue;
660 }
661
662 let Some(prev_tw) = &mut prev_tw else {
663 prev_tw = Some((lower_bound, upper_bound));
664 continue;
665 };
666
667 let prev_upper = prev_tw
670 .1
671 .unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
672 prev_tw.1 = Some(prev_upper);
673
674 let cur_upper = upper_bound.unwrap_or(
675 lower_bound
676 .add_duration(std_window_size)
677 .context(TimeSnafu)?,
678 );
679
680 if lower_bound
681 .sub(&prev_upper)
682 .map(|dist| dist <= window_size * self.time_window_merge_threshold as i32)
683 .unwrap_or(false)
684 {
685 prev_tw.1 = Some(cur_upper);
686 } else {
687 new_windows.insert(prev_tw.0, prev_tw.1);
688 *prev_tw = (lower_bound, Some(cur_upper));
689 }
690 }
691
692 if let Some(prev_tw) = prev_tw {
693 new_windows.insert(prev_tw.0, prev_tw.1);
694 }
695
696 self.windows = new_windows;
697
698 Ok(())
699 }
700}
701
702pub(crate) fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
703 let value = Value::from(value);
704 let value = value
705 .try_to_scalar_value(&value.data_type())
706 .with_context(|_| DatatypesSnafu {
707 extra: format!("Failed to convert to scalar value: {}", value),
708 })?;
709 Ok(value)
710}
711
/// Execution state of a task.
#[derive(Debug, Clone)]
enum ExecState {
    /// No query is running. This is the initial state and the state set by
    /// `TaskState::after_query_exec`.
    Idle,
    /// A query is running.
    // NOTE(review): never constructed in the code visible in this file — confirm it is still used.
    Executing,
}
717
/// How the next run relates to the stored checkpoints.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckpointMode {
    /// Initial mode; also the mode after `mark_full_snapshot` or
    /// `disable_incremental`.
    FullSnapshot,
    /// Mode entered after checkpoints were advanced while incremental mode
    /// is enabled.
    Incremental,
}
723
/// Time filter produced by `DirtyTimeWindows::gen_filter_exprs`, together
/// with the data it was built from.
#[derive(Debug, Clone)]
pub struct FilterExprInfo {
    /// `OR` of one `col >= start AND col < end` predicate per time range.
    pub expr: datafusion_expr::Expr,
    /// Name of the column the predicate in `expr` filters on.
    pub col_name: String,
    /// Half-open `[start, end)` ranges covered by `expr`.
    pub time_ranges: Vec<(Timestamp, Timestamp)>,
    /// Window size that was used while generating the filter.
    pub window_size: chrono::Duration,
}
732
733impl FilterExprInfo {
734 pub fn total_window_length(&self) -> chrono::Duration {
735 self.time_ranges
736 .iter()
737 .fold(chrono::Duration::zero(), |acc, (start, end)| {
738 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
739 })
740 }
741
742 pub fn predicate_for_col(
743 &self,
744 col_name: &str,
745 ) -> Result<Option<datafusion_expr::Expr>, Error> {
746 use datafusion_common::Column;
747 use datafusion_expr::{Expr, lit};
748
749 let mut expr_lst = Vec::with_capacity(self.time_ranges.len());
750 for (start, end) in &self.time_ranges {
751 let lower = to_df_literal(*start)?;
752 let upper = to_df_literal(*end)?;
753 let filter_col = || Expr::Column(Column::new_unqualified(col_name));
754 expr_lst.push(
755 filter_col()
756 .gt_eq(lit(lower))
757 .and(filter_col().lt(lit(upper))),
758 );
759 }
760
761 Ok(expr_lst.into_iter().reduce(|a, b| a.or(b)))
762 }
763}
764
765#[cfg(test)]
766mod test {
767 use pretty_assertions::assert_eq;
768 use session::context::QueryContext;
769
770 use super::*;
771 use crate::batching_mode::time_window::find_time_window_expr;
772 use crate::batching_mode::utils::sql_to_df_plan;
773 use crate::test_utils::create_test_query_engine;
774
/// The last execution time stays unset after a failed run and is recorded
/// after a successful one.
#[test]
fn test_task_state_records_last_execution_time() {
    let query_ctx = QueryContext::arc();
    let (_tx, rx) = tokio::sync::oneshot::channel();
    let mut state = TaskState::new(query_ctx, rx);

    // Nothing recorded initially, nor after a failed execution.
    assert_eq!(None, state.last_execution_time_millis());
    state.after_query_exec(std::time::Duration::from_millis(1), false);
    assert_eq!(None, state.last_execution_time_millis());

    // A successful execution records a timestamp.
    state.after_query_exec(std::time::Duration::from_millis(1), true);
    assert!(state.last_execution_time_millis().is_some());
}
788
/// Table-driven test for `merge_dirty_time_windows` followed by
/// `gen_filter_exprs`.
///
/// Each case is `(lower bounds, (window size, expire lower bound),
/// expected merged windows, expected filter SQL)`.
#[test]
fn test_merge_dirty_time_windows() {
    let merge_dist = DirtyTimeWindows::default().time_window_merge_threshold;
    let testcases = vec![
        // Gap of exactly `merge_dist` windows: merged into one window.
        (
            vec![
                Timestamp::new_second(0),
                Timestamp::new_second((1 + merge_dist as i64) * 5 * 60),
            ],
            (chrono::Duration::seconds(5 * 60), None),
            BTreeMap::from([(
                Timestamp::new_second(0),
                Some(Timestamp::new_second((2 + merge_dist as i64) * 5 * 60)),
            )]),
            Some(
                "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:25:00' AS TIMESTAMP)))",
            ),
        ),
        // Gap of `merge_dist + 1` windows: kept as two separate windows.
        (
            vec![
                Timestamp::new_second(0),
                Timestamp::new_second((2 + merge_dist as i64) * 5 * 60),
            ],
            (chrono::Duration::seconds(5 * 60), None),
            BTreeMap::from([
                (
                    Timestamp::new_second(0),
                    Some(Timestamp::new_second(5 * 60)),
                ),
                (
                    Timestamp::new_second((2 + merge_dist as i64) * 5 * 60),
                    Some(Timestamp::new_second((3 + merge_dist as i64) * 5 * 60)),
                ),
            ]),
            Some(
                "(((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:05:00' AS TIMESTAMP))) OR ((ts >= CAST('1970-01-01 00:25:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:30:00' AS TIMESTAMP))))",
            ),
        ),
        // Gap smaller than the threshold: merged.
        (
            vec![
                Timestamp::new_second(0),
                Timestamp::new_second((merge_dist as i64) * 5 * 60),
            ],
            (chrono::Duration::seconds(5 * 60), None),
            BTreeMap::from([(
                Timestamp::new_second(0),
                Some(Timestamp::new_second((1 + merge_dist as i64) * 5 * 60)),
            )]),
            Some(
                "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:20:00' AS TIMESTAMP)))",
            ),
        ),
        // Three windows, each within merge distance of the previous one.
        (
            vec![
                Timestamp::new_second(0),
                Timestamp::new_second((merge_dist as i64) * 3),
                Timestamp::new_second((merge_dist as i64) * 3 * 2),
            ],
            (chrono::Duration::seconds(3), None),
            BTreeMap::from([(
                Timestamp::new_second(0),
                Some(Timestamp::new_second((merge_dist as i64) * 7)),
            )]),
            Some(
                "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:00:21' AS TIMESTAMP)))",
            ),
        ),
        // 20 adjacent windows plus one far away: only the first merged
        // window fits into the per-query range.
        (
            Vec::from_iter((0..20).map(|i| Timestamp::new_second(i * 3)).chain(
                std::iter::once(Timestamp::new_second(
                    60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1),
                )),
            )),
            (chrono::Duration::seconds(3), None),
            BTreeMap::from([
                (Timestamp::new_second(0), Some(Timestamp::new_second(60))),
                (
                    Timestamp::new_second(60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1)),
                    Some(Timestamp::new_second(
                        60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1) + 3,
                    )),
                ),
            ]),
            Some(
                "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:00' AS TIMESTAMP)))",
            ),
        ),
        // 40 adjacent windows: merged into one, then split at the
        // per-query range limit.
        (
            Vec::from_iter((0..40).map(|i| Timestamp::new_second(i * 3))),
            (chrono::Duration::seconds(3), None),
            BTreeMap::from([(
                Timestamp::new_second(0),
                Some(Timestamp::new_second(40 * 3)),
            )]),
            Some(
                "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:00' AS TIMESTAMP)))",
            ),
        ),
        // One isolated window followed by a long merged window that is
        // split to use up the remaining range.
        (
            Vec::from_iter(
                std::iter::once(Timestamp::new_second(0))
                    .chain((0..40).map(|i| Timestamp::new_second(20 + i * 3))),
            ),
            (chrono::Duration::seconds(3), None),
            BTreeMap::from([
                (Timestamp::new_second(0), Some(Timestamp::new_second(3))),
                (Timestamp::new_second(20), Some(Timestamp::new_second(140))),
            ]),
            Some(
                "(((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:00:03' AS TIMESTAMP))) OR ((ts >= CAST('1970-01-01 00:00:20' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:17' AS TIMESTAMP))))",
            ),
        ),
        // Every window lies below the expire lower bound: nothing remains.
        (
            vec![
                Timestamp::new_second(0),
                Timestamp::new_second((merge_dist as i64) * 5 * 60),
            ],
            (
                chrono::Duration::seconds(5 * 60),
                Some(Timestamp::new_second((merge_dist as i64) * 6 * 60)),
            ),
            BTreeMap::from([]),
            None,
        ),
    ];
    for (lower_bounds, (window_size, expire_lower_bound), expected, expected_filter_expr) in
        testcases
    {
        let mut dirty = DirtyTimeWindows::default();
        dirty.add_lower_bounds(lower_bounds.into_iter());
        dirty
            .merge_dirty_time_windows(window_size, expire_lower_bound)
            .unwrap();
        assert_eq!(expected, dirty.windows);
        let filter_expr = dirty
            .gen_filter_exprs(
                "ts",
                expire_lower_bound,
                window_size,
                dirty.max_filter_num_per_query,
                0,
                None,
            )
            .unwrap()
            .map(|e| e.expr);

        // Compare the generated filter via its SQL representation.
        let unparser = datafusion::sql::unparser::Unparser::default();
        let to_sql = filter_expr
            .as_ref()
            .map(|e| unparser.expr_to_sql(e).unwrap().to_string());
        assert_eq!(expected_filter_expr, to_sql.as_deref());
    }
}
952
/// `align_time_window` snaps window bounds to the boundaries of the
/// `date_bin` expression found in the query.
#[tokio::test]
async fn test_align_time_window() {
    type TimeWindow = (Timestamp, Option<Timestamp>);
    struct TestCase {
        sql: String,
        /// Pairs of `(window before alignment, expected window after alignment)`.
        aligns: Vec<(TimeWindow, TimeWindow)>,
    }
    let testcases: Vec<TestCase> = vec![TestCase{
        sql: "SELECT date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window;".to_string(),
        aligns: vec![
            ((Timestamp::new_second(3), None), (Timestamp::new_second(0), None)),
            ((Timestamp::new_second(8), None), (Timestamp::new_second(5), None)),
            ((Timestamp::new_second(8), Some(Timestamp::new_second(10))), (Timestamp::new_second(5), Some(Timestamp::new_second(10)))),
            ((Timestamp::new_second(8), Some(Timestamp::new_second(9))), (Timestamp::new_second(5), Some(Timestamp::new_second(10)))),
        ],
    }];

    let query_engine = create_test_query_engine();
    let ctx = QueryContext::arc();
    for TestCase { sql, aligns } in testcases {
        let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &sql, true)
            .await
            .unwrap();

        let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
            &plan,
            query_engine.engine_state().catalog_manager().clone(),
            ctx.clone(),
        )
        .await
        .unwrap();

        // The query must contain a time window expression.
        let time_window_expr = time_window_expr
            .map(|expr| {
                TimeWindowExpr::from_expr(
                    &expr,
                    &column_name,
                    &df_schema,
                    &query_engine.engine_state().session_state(),
                )
            })
            .transpose()
            .unwrap()
            .unwrap();

        let dirty = DirtyTimeWindows::default();
        for (before_align, expected_after_align) in aligns {
            let after_align = dirty
                .align_time_window(before_align.0, before_align.1, &time_window_expr)
                .unwrap();
            assert_eq!(expected_after_align, after_align);
        }
    }
}
1007
/// Advancing checkpoints switches to incremental mode; marking a full
/// snapshot switches back without touching the stored checkpoints.
#[test]
fn test_task_state_checkpoint_mode_and_advancement() {
    let query_ctx = QueryContext::arc();
    let (_tx, rx) = tokio::sync::oneshot::channel();
    let mut state = TaskState::new(query_ctx, rx);

    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
    assert!(state.checkpoints().is_empty());

    state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
    assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
    assert_eq!(
        state.checkpoints(),
        &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
    );

    state.mark_full_snapshot();
    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
    assert_eq!(
        state.checkpoints(),
        &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
    );
}
1031
/// Once incremental mode is disabled, advancing checkpoints still stores
/// them but no longer switches to incremental mode.
#[test]
fn test_disable_incremental_persists_full_snapshot_mode() {
    let query_ctx = QueryContext::arc();
    let (_tx, rx) = tokio::sync::oneshot::channel();
    let mut state = TaskState::new(query_ctx, rx);

    assert!(!state.is_incremental_disabled());

    state.disable_incremental();
    assert!(state.is_incremental_disabled());
    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);

    // Checkpoints are stored, the mode stays full snapshot.
    state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
    assert_eq!(
        state.checkpoints(),
        &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
    );

    // Marking a full snapshot does not re-enable incremental mode.
    state.mark_full_snapshot();
    assert!(state.is_incremental_disabled());
    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
}
1058
/// Full-snapshot advancement needs a non-empty participant set that has a
/// watermark for every participant.
#[test]
fn test_full_snapshot_checkpoint_advancement_requires_participating_regions() {
    let query_ctx = QueryContext::arc();
    let (_tx, rx) = tokio::sync::oneshot::channel();
    let state = TaskState::new(query_ctx, rx);

    // No participants.
    assert!(!state.can_advance_full_snapshot_checkpoints(&BTreeSet::new(), &HashMap::new()));
    // Participant 2 has no watermark.
    assert!(!state.can_advance_full_snapshot_checkpoints(
        &BTreeSet::from([1_u64, 2_u64]),
        &HashMap::from([(1_u64, 10_u64)]),
    ));
    // Every participant has a watermark.
    assert!(state.can_advance_full_snapshot_checkpoints(
        &BTreeSet::from([1_u64, 2_u64]),
        &HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]),
    ));
}
1075
/// Incremental advancement requires matching participants and watermarks,
/// known checkpoints, non-decreasing watermarks and enabled incremental mode.
#[test]
fn test_incremental_checkpoint_advancement_requires_participation_alignment() {
    let query_ctx = QueryContext::arc();
    let (_tx, rx) = tokio::sync::oneshot::channel();
    let mut state = TaskState::new(query_ctx, rx);
    state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));

    // A subset of the known regions may advance.
    assert!(
        state.can_advance_incremental_checkpoints_with_participation(
            &BTreeSet::from([1_u64]),
            &HashMap::from([(1_u64, 11_u64)]),
        )
    );
    // Participant 2 has no watermark.
    assert!(
        !state.can_advance_incremental_checkpoints_with_participation(
            &BTreeSet::from([1_u64, 2_u64]),
            &HashMap::from([(1_u64, 11_u64)]),
        )
    );
    // Region 3 has no stored checkpoint.
    assert!(
        !state.can_advance_incremental_checkpoints_with_participation(
            &BTreeSet::from([3_u64]),
            &HashMap::from([(3_u64, 11_u64)]),
        )
    );
    // The watermark must not be below the stored checkpoint.
    assert!(
        !state.can_advance_incremental_checkpoints_with_participation(
            &BTreeSet::from([1_u64]),
            &HashMap::from([(1_u64, 9_u64)]),
        )
    );
    assert!(
        state.can_advance_incremental_checkpoints_with_participation(
            &BTreeSet::from([1_u64, 2_u64]),
            &HashMap::from([(1_u64, 11_u64), (2_u64, 21_u64)]),
        )
    );

    // Disabling incremental mode blocks advancement entirely.
    state.disable_incremental();
    assert!(
        !state.can_advance_incremental_checkpoints_with_participation(
            &BTreeSet::from([1_u64, 2_u64]),
            &HashMap::from([(1_u64, 12_u64), (2_u64, 22_u64)]),
        )
    );
}
1122
/// Incremental advancement updates only the participating regions and
/// keeps the checkpoints of all others.
#[test]
fn test_incremental_checkpoint_advancement_merges_participating_subset() {
    let query_ctx = QueryContext::arc();
    let (_tx, rx) = tokio::sync::oneshot::channel();
    let mut state = TaskState::new(query_ctx, rx);
    state.advance_checkpoints(HashMap::from([
        (1_u64, 10_u64),
        (2_u64, 20_u64),
        (3_u64, 30_u64),
    ]));

    state.advance_incremental_checkpoints_with_participation(
        &BTreeSet::from([1_u64, 3_u64]),
        HashMap::from([(1_u64, 12_u64), (3_u64, 35_u64)]),
    );

    assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
    // Region 2 did not participate and keeps its old checkpoint.
    assert_eq!(
        state.checkpoints(),
        &BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)])
    );
}
1145
/// Without any time range there is no predicate.
#[test]
fn test_filter_expr_info_predicate_for_col_empty_ranges() {
    let filter = FilterExprInfo {
        expr: datafusion_expr::col("ts"),
        col_name: "ts".to_string(),
        time_ranges: vec![],
        window_size: chrono::Duration::seconds(1),
    };

    assert!(filter.predicate_for_col("time_window").unwrap().is_none());
}
1157
/// A single range yields one `>= AND <` predicate on the requested column.
#[test]
fn test_filter_expr_info_predicate_for_col_single_range() {
    let filter = FilterExprInfo {
        expr: datafusion_expr::col("ts"),
        col_name: "ts".to_string(),
        time_ranges: vec![(Timestamp::new_second(0), Timestamp::new_second(1))],
        window_size: chrono::Duration::seconds(1),
    };

    let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
    let unparser = datafusion::sql::unparser::Unparser::default();
    assert_eq!(
        "((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP)))",
        unparser.expr_to_sql(&predicate).unwrap().to_string()
    );
}
1174
/// Several ranges are combined with `OR`, in the order they are stored.
#[test]
fn test_filter_expr_info_predicate_for_col_multiple_ranges() {
    let filter = FilterExprInfo {
        expr: datafusion_expr::col("ts"),
        col_name: "ts".to_string(),
        time_ranges: vec![
            (Timestamp::new_second(0), Timestamp::new_second(1)),
            (Timestamp::new_second(10), Timestamp::new_second(11)),
        ],
        window_size: chrono::Duration::seconds(1),
    };

    let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
    let unparser = datafusion::sql::unparser::Unparser::default();
    assert_eq!(
        "(((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP))) OR ((time_window >= CAST('1970-01-01 00:00:10' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:11' AS TIMESTAMP))))",
        unparser.expr_to_sql(&predicate).unwrap().to_string()
    );
}
1194
1195 fn state_with_past_update(age: Duration) -> TaskState {
1197 let query_ctx = QueryContext::arc();
1198 let (_tx, rx) = tokio::sync::oneshot::channel();
1199 let mut state = TaskState::new(query_ctx, rx);
1200 state.last_update_time = Instant::now() - age;
1201 state
1202 }
1203
/// With the trailing flag set to `true`, no timeout, and a zero last query
/// duration, the next start time is expected to be `last_update_time + min_refresh`.
#[test]
fn test_short_incremental_cadence_uses_min_refresh() {
    let min_refresh = Duration::from_secs(5);
    let window = Some(Duration::from_secs(60));
    let state = state_with_past_update(Duration::from_secs(10));

    let next_start = state.get_next_start_query_time(1, &window, min_refresh, None, 20, true);

    assert_eq!(next_start, state.last_update_time + min_refresh);
}
1228
/// When the last query took longer (20s) than `min_refresh` (5s), the next
/// start time is expected to be `last_update_time + last_query_duration`.
#[test]
fn test_short_incremental_cadence_respects_last_query_duration() {
    let last_query = Duration::from_secs(20);
    let mut state = state_with_past_update(Duration::from_secs(10));
    state.last_query_duration = last_query;

    let window = Some(Duration::from_secs(60));
    let next_start =
        state.get_next_start_query_time(1, &window, Duration::from_secs(5), None, 20, true);

    let expected = state.last_update_time + state.last_query_duration;
    assert_eq!(next_start, expected);
}
1249
/// With a 5s timeout supplied, the next start time is expected to be
/// `last_update_time + max_timeout`, even though both `min_refresh` (30s) and
/// the last query duration (20s) are larger.
#[test]
fn test_short_incremental_cadence_respects_max_timeout() {
    let max_timeout = Duration::from_secs(5);
    let min_refresh = Duration::from_secs(30);
    let window = Some(Duration::from_secs(60));

    let mut state = state_with_past_update(Duration::from_secs(10));
    state.last_query_duration = Duration::from_secs(20);

    let next_start = state.get_next_start_query_time(
        1,
        &window,
        min_refresh,
        Some(max_timeout),
        20,
        true,
    );

    let expected = state.last_update_time + max_timeout;
    assert_eq!(next_start, expected);
}
1271
/// With the trailing flag set to `false`, the next start time is expected to
/// be `last_update_time + 60s` (the time window size passed in), not the
/// shorter `min_refresh`.
#[test]
fn test_full_snapshot_ignores_short_cadence() {
    let window_len = Duration::from_secs(60);
    let window = Some(window_len);
    let min_refresh = Duration::from_secs(5);

    let mut state = state_with_past_update(Duration::from_secs(10));
    state.last_query_duration = Duration::from_secs(1);

    let next_start = state.get_next_start_query_time(1, &window, min_refresh, None, 20, false);

    assert_eq!(next_start, state.last_update_time + window_len);
}
1298
/// Adds a one-hour dirty window against a 1s time window size and a limit of
/// `1`, then expects the next start time to be no later than "now" for both
/// values of the trailing flag.
#[test]
fn test_dirty_window_overflow_schedules_immediately_even_with_short_cadence() {
    let mut state = state_with_past_update(Duration::from_secs(10));
    let dirty_start = Timestamp::new_second(0);
    let dirty_end = Timestamp::new_second(3600);
    state
        .dirty_time_windows
        .add_window(dirty_start, Some(dirty_end));

    let window = Some(Duration::from_secs(1));
    let min_refresh = Duration::from_secs(5);
    let flow_id = 1;

    // Same expectation with the flag on first, then off.
    for flag in [true, false] {
        let next_start =
            state.get_next_start_query_time(flow_id, &window, min_refresh, None, 1, flag);
        assert!(
            next_start <= Instant::now(),
            "dirty overflow should schedule immediately"
        );
    }
}
1341
/// Calls `get_next_start_query_time` with the trailing flag set to `false` and
/// expects `last_update_time + 60s`, i.e. the 60s time window size rather than
/// the 5s `min_refresh`.
// NOTE(review): the visible body never touches `incremental_disabled`; presumably the
// caller folds that state into the `false` flag — confirm against the call site.
#[test]
fn test_incremental_disabled_ignores_short_cadence() {
    let mut state = state_with_past_update(Duration::from_secs(10));
    state.last_query_duration = Duration::from_secs(1);

    let flow_id = 1;
    let window = Some(Duration::from_secs(60));
    let next_start = state.get_next_start_query_time(
        flow_id,
        &window,
        Duration::from_secs(5),
        None,
        20,
        false,
    );

    let expected = state.last_update_time + Duration::from_secs(60);
    assert_eq!(next_start, expected);
}
1373}