1use std::collections::{BTreeMap, BTreeSet, HashMap};
19use std::time::Duration;
20
21use common_telemetry::debug;
22use common_telemetry::tracing::warn;
23use common_time::Timestamp;
24use datatypes::value::Value;
25use session::context::QueryContextRef;
26use snafu::{OptionExt, ResultExt, ensure};
27use tokio::sync::oneshot;
28use tokio::time::Instant;
29
30use crate::batching_mode::task::BatchingTask;
31use crate::batching_mode::time_window::TimeWindowExpr;
32use crate::error::{DatatypesSnafu, InternalSnafu, TimeSnafu, UnexpectedSnafu};
33use crate::metrics::{
34 METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_SIZE,
35 METRIC_FLOW_BATCHING_ENGINE_STALLED_WINDOW_SIZE,
36};
37use crate::{Error, FlowId};
38
/// Mutable runtime state of one batching-mode flow task.
#[derive(Debug)]
pub struct TaskState {
    /// Query context supplied when the state is created.
    pub(crate) query_ctx: QueryContextRef,
    /// Instant of creation, refreshed every time `after_query_exec` is called.
    last_update_time: Instant,
    /// Duration of the most recent query as reported to `after_query_exec`
    /// (zero until the first query finishes).
    last_query_duration: Duration,
    /// Wall-clock time in milliseconds of the last *successful* query;
    /// `None` until a query succeeds.
    last_exec_time_millis: Option<i64>,
    /// Time windows that still have to be queried.
    pub(crate) dirty_time_windows: DirtyTimeWindows,
    /// Current checkpoint mode; starts as `FullSnapshot`.
    checkpoint_mode: CheckpointMode,
    /// Recorded checkpoint per region id (region id -> sequence).
    checkpoints: BTreeMap<u64, u64>,
    /// Once set, advancing checkpoints no longer switches the mode to
    /// `Incremental`.
    incremental_disabled: bool,
    /// Execution state; reset to `Idle` after every query.
    exec_state: ExecState,
    /// Receiving half of a oneshot channel.
    /// NOTE(review): presumably used to signal task shutdown — confirm with the owner of the sender.
    pub(crate) shutdown_rx: oneshot::Receiver<()>,
    /// Handle of a spawned tokio task, `None` at construction.
    /// NOTE(review): presumably the task's background loop — confirm.
    pub(crate) task_handle: Option<tokio::task::JoinHandle<()>>,
}
67impl TaskState {
68 pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
69 Self {
70 query_ctx,
71 last_update_time: Instant::now(),
72 last_query_duration: Duration::from_secs(0),
73 last_exec_time_millis: None,
74 dirty_time_windows: Default::default(),
75 checkpoint_mode: CheckpointMode::FullSnapshot,
76 checkpoints: Default::default(),
77 incremental_disabled: false,
78 exec_state: ExecState::Idle,
79 shutdown_rx,
80 task_handle: None,
81 }
82 }
83
84 pub fn after_query_exec(&mut self, elapsed: Duration, is_succ: bool) {
87 self.exec_state = ExecState::Idle;
88 self.last_query_duration = elapsed;
89 self.last_update_time = Instant::now();
90 if is_succ {
91 self.last_exec_time_millis = Some(common_time::util::current_time_millis());
92 }
93 }
94
    /// Returns the millisecond timestamp of the last successful query, or
    /// `None` if no query has succeeded yet.
    pub fn last_execution_time_millis(&self) -> Option<i64> {
        self.last_exec_time_millis
    }
98
    /// Returns the current checkpoint mode.
    pub fn checkpoint_mode(&self) -> CheckpointMode {
        self.checkpoint_mode
    }
102
    /// Returns the recorded checkpoints, keyed by region id.
    pub fn checkpoints(&self) -> &BTreeMap<u64, u64> {
        &self.checkpoints
    }
106
    /// Returns `true` once `disable_incremental` has been called.
    pub fn is_incremental_disabled(&self) -> bool {
        self.incremental_disabled
    }
110
111 pub fn disable_incremental(&mut self) {
114 self.incremental_disabled = true;
115 self.mark_full_snapshot();
116 }
117
    /// Switches the checkpoint mode to `FullSnapshot`; recorded checkpoints
    /// are left untouched.
    pub fn mark_full_snapshot(&mut self) {
        self.checkpoint_mode = CheckpointMode::FullSnapshot;
    }
121
122 pub fn advance_checkpoints(&mut self, watermark_map: HashMap<u64, u64>) {
123 self.checkpoints = watermark_map.into_iter().collect();
124 if !self.incremental_disabled {
125 self.checkpoint_mode = CheckpointMode::Incremental;
126 }
127 }
128
129 pub fn advance_incremental_checkpoints_with_participation(
130 &mut self,
131 participating_regions: &BTreeSet<u64>,
132 watermark_map: HashMap<u64, u64>,
133 ) {
134 for region_id in participating_regions {
135 if let Some(seq) = watermark_map.get(region_id) {
136 self.checkpoints.insert(*region_id, *seq);
137 }
138 }
139 if !self.incremental_disabled {
140 self.checkpoint_mode = CheckpointMode::Incremental;
141 }
142 }
143
144 pub fn can_advance_full_snapshot_checkpoints(
145 &self,
146 participating_regions: &BTreeSet<u64>,
147 watermark_map: &HashMap<u64, u64>,
148 ) -> bool {
149 !participating_regions.is_empty()
150 && participating_regions.len() == watermark_map.len()
151 && participating_regions
152 .iter()
153 .all(|region_id| watermark_map.contains_key(region_id))
154 }
155
156 pub fn can_advance_incremental_checkpoints_with_participation(
157 &self,
158 participating_regions: &BTreeSet<u64>,
159 watermark_map: &HashMap<u64, u64>,
160 ) -> bool {
161 !self.incremental_disabled
162 && !self.checkpoints.is_empty()
163 && !participating_regions.is_empty()
164 && participating_regions.len() == watermark_map.len()
165 && participating_regions
166 .iter()
167 .all(|region_id| self.checkpoints.contains_key(region_id))
168 && participating_regions.iter().all(|region_id| {
169 let checkpoint = self.checkpoints.get(region_id);
170 watermark_map
171 .get(region_id)
172 .zip(checkpoint)
173 .is_some_and(|(seq, checkpoint)| seq >= checkpoint)
174 })
175 }
176
177 pub fn get_next_start_query_time(
192 &self,
193 flow_id: FlowId,
194 time_window_size: &Option<Duration>,
195 min_refresh_duration: Duration,
196 max_timeout: Option<Duration>,
197 max_filter_num_per_query: usize,
198 prefer_short_incremental_cadence: bool,
199 ) -> Instant {
200 let lower = time_window_size.unwrap_or(min_refresh_duration);
202 let next_duration = self.last_query_duration.max(lower);
203 let next_duration = if let Some(max_timeout) = max_timeout {
204 next_duration.min(max_timeout)
205 } else {
206 next_duration
207 };
208
209 let cur_dirty_window_size = self.dirty_time_windows.window_size();
210 let max_query_update_range = (*time_window_size)
212 .unwrap_or_default()
213 .mul_f64(max_filter_num_per_query as f64);
214 if cur_dirty_window_size < max_query_update_range {
217 if prefer_short_incremental_cadence {
218 let next_duration = self.last_query_duration.max(min_refresh_duration);
222 let next_duration = if let Some(max_timeout) = max_timeout {
223 next_duration.min(max_timeout)
224 } else {
225 next_duration
226 };
227 self.last_update_time + next_duration
228 } else {
229 self.last_update_time + next_duration
230 }
231 } else {
232 debug!(
235 "Flow id = {}, still have too many {} dirty time window({:?}), execute immediately",
236 flow_id,
237 self.dirty_time_windows.windows.len(),
238 self.dirty_time_windows.windows
239 );
240 Instant::now()
241 }
242 }
243}
244
/// Set of time windows whose data changed and must be queried again.
#[derive(Debug, Clone)]
pub struct DirtyTimeWindows {
    /// Window lower bound -> optional upper bound. A `None` upper bound is
    /// treated as `lower bound + window size` when windows are merged or
    /// turned into filters.
    windows: BTreeMap<Timestamp, Option<Timestamp>>,
    /// Stored at construction (default 20).
    /// NOTE(review): not read by the methods visible in this file outside of tests — confirm usage.
    max_filter_num_per_query: usize,
    /// Windows whose gap is at most this many window sizes are merged by
    /// `merge_dirty_time_windows` (default 3).
    time_window_merge_threshold: usize,
}
258
impl DirtyTimeWindows {
    /// Creates an empty set of dirty windows with the given per-query filter
    /// limit and merge threshold.
    pub fn new(max_filter_num_per_query: usize, time_window_merge_threshold: usize) -> Self {
        Self {
            windows: BTreeMap::new(),
            max_filter_num_per_query,
            time_window_merge_threshold,
        }
    }
}
268
269impl Default for DirtyTimeWindows {
270 fn default() -> Self {
271 Self {
272 windows: BTreeMap::new(),
273 max_filter_num_per_query: 20,
274 time_window_merge_threshold: 3,
275 }
276 }
277}
278
279impl DirtyTimeWindows {
    /// Merge distance expressed in number of time windows.
    /// NOTE(review): has the same value as the default
    /// `time_window_merge_threshold` (3); in the visible code it is only
    /// referenced by tests — confirm whether it is still needed.
    pub const MERGE_DIST: i32 = 3;
284
285 pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
291 for lower_bound in lower_bounds {
292 let entry = self.windows.entry(lower_bound);
293 entry.or_insert(None);
294 }
295 }
296
297 pub fn window_size(&self) -> Duration {
298 let mut ret = Duration::from_secs(0);
299 for (start, end) in &self.windows {
300 if let Some(end) = end
301 && let Some(duration) = end.sub(start)
302 {
303 ret += duration.to_std().unwrap_or_default();
304 }
305 }
306 ret
307 }
308
    /// Adds one window; if a window with the same `start` already exists the
    /// two upper bounds are combined (see `union_window_end`).
    pub fn add_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
        self.add_or_merge_window(start, end);
    }
312
313 pub fn add_windows(&mut self, time_ranges: Vec<(Timestamp, Timestamp)>) {
314 for (start, end) in time_ranges {
315 self.add_or_merge_window(start, Some(end));
316 }
317 }
318
319 pub fn add_dirty_windows(&mut self, dirty_windows: &DirtyTimeWindows) {
321 for (start, end) in &dirty_windows.windows {
322 self.add_or_merge_window(*start, *end);
323 }
324 }
325
326 fn add_or_merge_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
327 self.windows
328 .entry(start)
329 .and_modify(|current_end| {
330 *current_end = Self::union_window_end(*current_end, end);
331 })
332 .or_insert(end);
333 }
334
335 fn union_window_end(
336 current_end: Option<Timestamp>,
337 incoming_end: Option<Timestamp>,
338 ) -> Option<Timestamp> {
339 match (current_end, incoming_end) {
340 (Some(current), Some(incoming)) => Some(current.max(incoming)),
341 (Some(end), None) | (None, Some(end)) => Some(end),
345 (None, None) => None,
346 }
347 }
348
    /// Removes all dirty windows.
    pub fn clean(&mut self) {
        self.windows.clear();
    }
353
    /// Marks the set as dirty by adding a window that starts at second 0 and
    /// has no known upper bound.
    pub fn set_dirty(&mut self) {
        self.add_or_merge_window(Timestamp::new_second(0), None);
    }
359
    /// Number of stored windows (not their total time range).
    pub fn len(&self) -> usize {
        self.windows.len()
    }
364
    /// Returns `true` when there is no dirty window.
    pub fn is_empty(&self) -> bool {
        self.windows.is_empty()
    }
368
369 pub fn effective_count(&self, window_size: &Duration) -> usize {
372 if self.windows.is_empty() {
373 return 0;
374 }
375 let window_size =
376 chrono::Duration::from_std(*window_size).unwrap_or(chrono::Duration::zero());
377 let total_window_time_range =
378 self.windows
379 .iter()
380 .fold(chrono::Duration::zero(), |acc, (start, end)| {
381 if let Some(end) = end {
382 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
383 } else {
384 acc + window_size
385 }
386 });
387
388 if window_size.num_seconds() == 0 {
390 0
391 } else {
392 (total_window_time_range.num_seconds() / window_size.num_seconds()) as usize
393 }
394 }
395
396 pub fn gen_filter_exprs(
402 &mut self,
403 col_name: &str,
404 expire_lower_bound: Option<Timestamp>,
405 window_size: chrono::Duration,
406 window_cnt: usize,
407 flow_id: FlowId,
408 task_ctx: Option<&BatchingTask>,
409 ) -> Result<Option<FilterExprInfo>, Error> {
410 ensure!(
411 window_size.num_seconds() > 0,
412 UnexpectedSnafu {
413 reason: "window_size is zero, can't generate filter exprs",
414 }
415 );
416
417 debug!(
418 "expire_lower_bound: {:?}, window_size: {:?}",
419 expire_lower_bound.map(|t| t.to_iso8601_string()),
420 window_size
421 );
422 self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
423
424 if self.windows.len() > window_cnt {
425 let first_time_window = self.windows.first_key_value();
426 let last_time_window = self.windows.last_key_value();
427
428 if let Some(task_ctx) = task_ctx {
429 warn!(
430 "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
431 task_ctx.config.flow_id,
432 self.windows.len(),
433 window_cnt,
434 task_ctx.config.time_window_expr,
435 task_ctx.config.expire_after,
436 first_time_window,
437 last_time_window,
438 task_ctx.config.query
439 );
440 } else {
441 warn!(
442 "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. first_time_window={:?}, last_time_window={:?}",
443 flow_id,
444 self.windows.len(),
445 window_cnt,
446 first_time_window,
447 last_time_window
448 )
449 }
450 }
451
452 let max_time_range = window_size * window_cnt as i32;
454
455 let mut to_be_query = BTreeMap::new();
456 let mut new_windows = self.windows.clone();
457 let mut cur_time_range = chrono::Duration::zero();
458 for (idx, (start, end)) in self.windows.iter().enumerate() {
459 let first_end = start
460 .add_duration(window_size.to_std().unwrap())
461 .context(TimeSnafu)?;
462 let end = end.unwrap_or(first_end);
463
464 if cur_time_range >= max_time_range {
466 break;
467 }
468
469 if idx >= window_cnt {
471 break;
472 }
473
474 let Some(x) = end.sub(start) else {
475 continue;
476 };
477 if cur_time_range + x <= max_time_range {
478 to_be_query.insert(*start, Some(end));
479 new_windows.remove(start);
480 cur_time_range += x;
481 } else {
482 let surplus = max_time_range - cur_time_range;
485 if surplus.num_seconds() <= window_size.num_seconds() {
486 break;
488 }
489 let times = surplus.num_seconds() / window_size.num_seconds();
490
491 let split_offset = window_size * times as i32;
492 let split_at = start
493 .add_duration(split_offset.to_std().unwrap())
494 .context(TimeSnafu)?;
495 to_be_query.insert(*start, Some(split_at));
496
497 new_windows.remove(start);
499 new_windows.insert(split_at, Some(end));
500 cur_time_range += split_offset;
501 break;
502 }
503 }
504
505 self.windows = new_windows;
506
507 METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_CNT
508 .with_label_values(&[flow_id.to_string().as_str()])
509 .observe(to_be_query.len() as f64);
510
511 let full_time_range = to_be_query
512 .iter()
513 .fold(chrono::Duration::zero(), |acc, (start, end)| {
514 if let Some(end) = end {
515 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
516 } else {
517 acc + window_size
518 }
519 })
520 .num_seconds() as f64;
521 METRIC_FLOW_BATCHING_ENGINE_QUERY_WINDOW_SIZE
522 .with_label_values(&[flow_id.to_string().as_str()])
523 .observe(full_time_range);
524
525 let stalled_time_range =
526 self.windows
527 .iter()
528 .fold(chrono::Duration::zero(), |acc, (start, end)| {
529 if let Some(end) = end {
530 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
531 } else {
532 acc + window_size
533 }
534 });
535
536 METRIC_FLOW_BATCHING_ENGINE_STALLED_WINDOW_SIZE
537 .with_label_values(&[flow_id.to_string().as_str()])
538 .observe(stalled_time_range.num_seconds() as f64);
539
540 let std_window_size = window_size.to_std().map_err(|e| {
541 InternalSnafu {
542 reason: e.to_string(),
543 }
544 .build()
545 })?;
546
547 let mut expr_lst = vec![];
548 let mut time_ranges = vec![];
549 for (start, end) in to_be_query.into_iter() {
550 let (start, end) = if let Some(ctx) = task_ctx {
552 let Some(time_window_expr) = &ctx.config.time_window_expr else {
553 UnexpectedSnafu {
554 reason: "time_window_expr is not set",
555 }
556 .fail()?
557 };
558 self.align_time_window(start, end, time_window_expr)?
559 } else {
560 (start, end)
561 };
562 let end = end.unwrap_or(start.add_duration(std_window_size).context(TimeSnafu)?);
563 time_ranges.push((start, end));
564
565 debug!(
566 "Time window start: {:?}, end: {:?}",
567 start.to_iso8601_string(),
568 end.to_iso8601_string()
569 );
570
571 use datafusion_expr::{col, lit};
572 let lower = to_df_literal(start)?;
573 let upper = to_df_literal(end)?;
574 let expr = col(col_name)
575 .gt_eq(lit(lower))
576 .and(col(col_name).lt(lit(upper)));
577 expr_lst.push(expr);
578 }
579 let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
580 let ret = expr.map(|expr| FilterExprInfo {
581 expr,
582 col_name: col_name.to_string(),
583 time_ranges,
584 window_size,
585 });
586 Ok(ret)
587 }
588
589 fn align_time_window(
590 &self,
591 start: Timestamp,
592 end: Option<Timestamp>,
593 time_window_expr: &TimeWindowExpr,
594 ) -> Result<(Timestamp, Option<Timestamp>), Error> {
595 let align_start = time_window_expr.eval(start)?.0.context(UnexpectedSnafu {
596 reason: format!(
597 "Failed to align start time {:?} with time window expr {:?}",
598 start, time_window_expr
599 ),
600 })?;
601 let align_end = end
602 .and_then(|end| {
603 time_window_expr
604 .eval(end)
605 .map(|r| if r.0 == Some(end) { r.0 } else { r.1 })
607 .transpose()
608 })
609 .transpose()?;
610 Ok((align_start, align_end))
611 }
612
613 pub fn merge_dirty_time_windows(
617 &mut self,
618 window_size: chrono::Duration,
619 expire_lower_bound: Option<Timestamp>,
620 ) -> Result<(), Error> {
621 if self.windows.is_empty() {
622 return Ok(());
623 }
624
625 let mut new_windows = BTreeMap::new();
626
627 let std_window_size = window_size.to_std().map_err(|e| {
628 InternalSnafu {
629 reason: e.to_string(),
630 }
631 .build()
632 })?;
633
634 let mut prev_tw = None;
636 for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
637 if let Some(expire_lower_bound) = expire_lower_bound
639 && lower_bound < expire_lower_bound
640 {
641 continue;
642 }
643
644 let Some(prev_tw) = &mut prev_tw else {
645 prev_tw = Some((lower_bound, upper_bound));
646 continue;
647 };
648
649 let prev_upper = prev_tw
652 .1
653 .unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
654 prev_tw.1 = Some(prev_upper);
655
656 let cur_upper = upper_bound.unwrap_or(
657 lower_bound
658 .add_duration(std_window_size)
659 .context(TimeSnafu)?,
660 );
661
662 if lower_bound
663 .sub(&prev_upper)
664 .map(|dist| dist <= window_size * self.time_window_merge_threshold as i32)
665 .unwrap_or(false)
666 {
667 prev_tw.1 = Some(cur_upper);
668 } else {
669 new_windows.insert(prev_tw.0, prev_tw.1);
670 *prev_tw = (lower_bound, Some(cur_upper));
671 }
672 }
673
674 if let Some(prev_tw) = prev_tw {
675 new_windows.insert(prev_tw.0, prev_tw.1);
676 }
677
678 self.windows = new_windows;
679
680 Ok(())
681 }
682}
683
684fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
685 let value = Value::from(value);
686 let value = value
687 .try_to_scalar_value(&value.data_type())
688 .with_context(|_| DatatypesSnafu {
689 extra: format!("Failed to convert to scalar value: {}", value),
690 })?;
691 Ok(value)
692}
693
/// Execution state of a task.
#[derive(Debug, Clone)]
enum ExecState {
    /// No query is running; set at construction and after every query.
    Idle,
    /// A query is running.
    /// NOTE(review): never constructed in the visible part of this file — confirm it is still used.
    Executing,
}
699
/// Checkpoint mode of a task.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckpointMode {
    /// Initial mode; also the mode set by `mark_full_snapshot` and
    /// `disable_incremental`.
    FullSnapshot,
    /// Entered when checkpoints are advanced while incremental mode is not
    /// disabled.
    Incremental,
}
705
/// Time filter generated for one query, together with the ranges it covers.
#[derive(Debug, Clone)]
pub struct FilterExprInfo {
    /// Combined filter expression (per-range predicates joined with `OR`).
    pub expr: datafusion_expr::Expr,
    /// Name of the column the filter was generated for.
    pub col_name: String,
    /// Covered ranges as `(start, end)`; predicates built from them use
    /// `>= start` and `< end`.
    pub time_ranges: Vec<(Timestamp, Timestamp)>,
    /// Window size the filter was generated with.
    pub window_size: chrono::Duration,
}
714
715impl FilterExprInfo {
716 pub fn total_window_length(&self) -> chrono::Duration {
717 self.time_ranges
718 .iter()
719 .fold(chrono::Duration::zero(), |acc, (start, end)| {
720 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
721 })
722 }
723
724 pub fn predicate_for_col(
725 &self,
726 col_name: &str,
727 ) -> Result<Option<datafusion_expr::Expr>, Error> {
728 use datafusion_common::Column;
729 use datafusion_expr::{Expr, lit};
730
731 let mut expr_lst = Vec::with_capacity(self.time_ranges.len());
732 for (start, end) in &self.time_ranges {
733 let lower = to_df_literal(*start)?;
734 let upper = to_df_literal(*end)?;
735 let filter_col = || Expr::Column(Column::new_unqualified(col_name));
736 expr_lst.push(
737 filter_col()
738 .gt_eq(lit(lower))
739 .and(filter_col().lt(lit(upper))),
740 );
741 }
742
743 Ok(expr_lst.into_iter().reduce(|a, b| a.or(b)))
744 }
745}
746
747#[cfg(test)]
748mod test {
749 use pretty_assertions::assert_eq;
750 use session::context::QueryContext;
751
752 use super::*;
753 use crate::batching_mode::time_window::find_time_window_expr;
754 use crate::batching_mode::utils::sql_to_df_plan;
755 use crate::test_utils::create_test_query_engine;
756
757 #[test]
758 fn test_task_state_records_last_execution_time() {
759 let query_ctx = QueryContext::arc();
760 let (_tx, rx) = tokio::sync::oneshot::channel();
761 let mut state = TaskState::new(query_ctx, rx);
762
763 assert_eq!(None, state.last_execution_time_millis());
764 state.after_query_exec(std::time::Duration::from_millis(1), false);
765 assert_eq!(None, state.last_execution_time_millis());
766
767 state.after_query_exec(std::time::Duration::from_millis(1), true);
768 assert!(state.last_execution_time_millis().is_some());
769 }
770
    /// Table-driven check of window merging and of the filter generated
    /// afterwards.
    ///
    /// Each case is `(lower bounds, (window size, expire lower bound),
    /// expected windows after merging, expected filter as SQL)`.
    #[test]
    fn test_merge_dirty_time_windows() {
        let merge_dist = DirtyTimeWindows::default().time_window_merge_threshold;
        let testcases = vec![
            // Gap between the first window's end and the second window's start
            // is exactly `merge_dist` windows: still merged.
            (
                vec![
                    Timestamp::new_second(0),
                    Timestamp::new_second((1 + merge_dist as i64) * 5 * 60),
                ],
                (chrono::Duration::seconds(5 * 60), None),
                BTreeMap::from([(
                    Timestamp::new_second(0),
                    Some(Timestamp::new_second((2 + merge_dist as i64) * 5 * 60)),
                )]),
                Some(
                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:25:00' AS TIMESTAMP)))",
                ),
            ),
            // Gap is one window larger than `merge_dist`: kept separate.
            (
                vec![
                    Timestamp::new_second(0),
                    Timestamp::new_second((2 + merge_dist as i64) * 5 * 60),
                ],
                (chrono::Duration::seconds(5 * 60), None),
                BTreeMap::from([
                    (
                        Timestamp::new_second(0),
                        Some(Timestamp::new_second(5 * 60)),
                    ),
                    (
                        Timestamp::new_second((2 + merge_dist as i64) * 5 * 60),
                        Some(Timestamp::new_second((3 + merge_dist as i64) * 5 * 60)),
                    ),
                ]),
                Some(
                    "(((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:05:00' AS TIMESTAMP))) OR ((ts >= CAST('1970-01-01 00:25:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:30:00' AS TIMESTAMP))))",
                ),
            ),
            // Gap smaller than `merge_dist` windows: merged.
            (
                vec![
                    Timestamp::new_second(0),
                    Timestamp::new_second((merge_dist as i64) * 5 * 60),
                ],
                (chrono::Duration::seconds(5 * 60), None),
                BTreeMap::from([(
                    Timestamp::new_second(0),
                    Some(Timestamp::new_second((1 + merge_dist as i64) * 5 * 60)),
                )]),
                Some(
                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:20:00' AS TIMESTAMP)))",
                ),
            ),
            // Three windows, each within merge distance of the previous merged
            // window: all collapse into one.
            (
                vec![
                    Timestamp::new_second(0),
                    Timestamp::new_second((merge_dist as i64) * 3),
                    Timestamp::new_second((merge_dist as i64) * 3 * 2),
                ],
                (chrono::Duration::seconds(3), None),
                BTreeMap::from([(
                    Timestamp::new_second(0),
                    Some(Timestamp::new_second((merge_dist as i64) * 7)),
                )]),
                Some(
                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:00:21' AS TIMESTAMP)))",
                ),
            ),
            // 20 adjacent windows plus one that is too far away to merge; the
            // first merged window already uses the whole per-query budget, so
            // the filter only covers it.
            (
                Vec::from_iter((0..20).map(|i| Timestamp::new_second(i * 3)).chain(
                    std::iter::once(Timestamp::new_second(
                        60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1),
                    )),
                )),
                (chrono::Duration::seconds(3), None),
                BTreeMap::from([
                    (Timestamp::new_second(0), Some(Timestamp::new_second(60))),
                    (
                        Timestamp::new_second(60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1)),
                        Some(Timestamp::new_second(
                            60 + 3 * (DirtyTimeWindows::MERGE_DIST as i64 + 1) + 3,
                        )),
                    ),
                ]),
                Some(
                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:00' AS TIMESTAMP)))",
                ),
            ),
            // 40 adjacent windows merge into one that exceeds the per-query
            // budget; the filter only covers the first 20 windows' worth.
            (
                Vec::from_iter((0..40).map(|i| Timestamp::new_second(i * 3))),
                (chrono::Duration::seconds(3), None),
                BTreeMap::from([(
                    Timestamp::new_second(0),
                    Some(Timestamp::new_second(40 * 3)),
                )]),
                Some(
                    "((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:00' AS TIMESTAMP)))",
                ),
            ),
            // One isolated window followed by a long merged window: the long
            // window is split so that both together fit the per-query budget.
            (
                Vec::from_iter(
                    std::iter::once(Timestamp::new_second(0))
                        .chain((0..40).map(|i| Timestamp::new_second(20 + i * 3))),
                ),
                (chrono::Duration::seconds(3), None),
                BTreeMap::from([
                    (Timestamp::new_second(0), Some(Timestamp::new_second(3))),
                    (Timestamp::new_second(20), Some(Timestamp::new_second(140))),
                ]),
                Some(
                    "(((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:00:03' AS TIMESTAMP))) OR ((ts >= CAST('1970-01-01 00:00:20' AS TIMESTAMP)) AND (ts < CAST('1970-01-01 00:01:17' AS TIMESTAMP))))",
                ),
            ),
            // Every lower bound is before the expire lower bound: all windows
            // are dropped and no filter is produced.
            (
                vec![
                    Timestamp::new_second(0),
                    Timestamp::new_second((merge_dist as i64) * 5 * 60),
                ],
                (
                    chrono::Duration::seconds(5 * 60),
                    Some(Timestamp::new_second((merge_dist as i64) * 6 * 60)),
                ),
                BTreeMap::from([]),
                None,
            ),
        ];
        for (lower_bounds, (window_size, expire_lower_bound), expected, expected_filter_expr) in
            testcases
        {
            let mut dirty = DirtyTimeWindows::default();
            dirty.add_lower_bounds(lower_bounds.into_iter());
            dirty
                .merge_dirty_time_windows(window_size, expire_lower_bound)
                .unwrap();
            assert_eq!(expected, dirty.windows);
            let filter_expr = dirty
                .gen_filter_exprs(
                    "ts",
                    expire_lower_bound,
                    window_size,
                    dirty.max_filter_num_per_query,
                    0,
                    None,
                )
                .unwrap()
                .map(|e| e.expr);

            // Compare the generated filter through its SQL rendering.
            let unparser = datafusion::sql::unparser::Unparser::default();
            let to_sql = filter_expr
                .as_ref()
                .map(|e| unparser.expr_to_sql(e).unwrap().to_string());
            assert_eq!(expected_filter_expr, to_sql.as_deref());
        }
    }
934
    /// Checks `align_time_window` against a time window expression extracted
    /// from a `date_bin(INTERVAL '5 second', ts)` query.
    #[tokio::test]
    async fn test_align_time_window() {
        type TimeWindow = (Timestamp, Option<Timestamp>);
        struct TestCase {
            sql: String,
            // Pairs of (window before alignment, expected window after alignment).
            aligns: Vec<(TimeWindow, TimeWindow)>,
        }
        let testcases: Vec<TestCase> = vec![TestCase{
            sql: "SELECT date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window;".to_string(),
            aligns: vec![
                // A start inside a window is expected to move to that window's start; a missing end stays missing.
                ((Timestamp::new_second(3), None), (Timestamp::new_second(0), None)),
                ((Timestamp::new_second(8), None), (Timestamp::new_second(5), None)),
                // An end that already sits on a window boundary is kept.
                ((Timestamp::new_second(8), Some(Timestamp::new_second(10))), (Timestamp::new_second(5), Some(Timestamp::new_second(10)))),
                // An end inside a window is expected to move up to 10.
                ((Timestamp::new_second(8), Some(Timestamp::new_second(9))), (Timestamp::new_second(5), Some(Timestamp::new_second(10)))),
            ],
        }];

        let query_engine = create_test_query_engine();
        let ctx = QueryContext::arc();
        for TestCase { sql, aligns } in testcases {
            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &sql, true)
                .await
                .unwrap();

            let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
                &plan,
                query_engine.engine_state().catalog_manager().clone(),
                ctx.clone(),
            )
            .await
            .unwrap();

            // The query must contain a time window expression, hence the
            // second `unwrap` on the `Option`.
            let time_window_expr = time_window_expr
                .map(|expr| {
                    TimeWindowExpr::from_expr(
                        &expr,
                        &column_name,
                        &df_schema,
                        &query_engine.engine_state().session_state(),
                    )
                })
                .transpose()
                .unwrap()
                .unwrap();

            let dirty = DirtyTimeWindows::default();
            for (before_align, expected_after_align) in aligns {
                let after_align = dirty
                    .align_time_window(before_align.0, before_align.1, &time_window_expr)
                    .unwrap();
                assert_eq!(expected_after_align, after_align);
            }
        }
    }
989
990 #[test]
991 fn test_task_state_checkpoint_mode_and_advancement() {
992 let query_ctx = QueryContext::arc();
993 let (_tx, rx) = tokio::sync::oneshot::channel();
994 let mut state = TaskState::new(query_ctx, rx);
995
996 assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
997 assert!(state.checkpoints().is_empty());
998
999 state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1000 assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
1001 assert_eq!(
1002 state.checkpoints(),
1003 &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1004 );
1005
1006 state.mark_full_snapshot();
1007 assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1008 assert_eq!(
1009 state.checkpoints(),
1010 &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1011 );
1012 }
1013
1014 #[test]
1015 fn test_disable_incremental_persists_full_snapshot_mode() {
1016 let query_ctx = QueryContext::arc();
1017 let (_tx, rx) = tokio::sync::oneshot::channel();
1018 let mut state = TaskState::new(query_ctx, rx);
1019
1020 assert!(!state.is_incremental_disabled());
1021
1022 state.disable_incremental();
1024 assert!(state.is_incremental_disabled());
1025 assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1026
1027 state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1029 assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1030 assert_eq!(
1031 state.checkpoints(),
1032 &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
1033 );
1034
1035 state.mark_full_snapshot();
1037 assert!(state.is_incremental_disabled());
1038 assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
1039 }
1040
1041 #[test]
1042 fn test_full_snapshot_checkpoint_advancement_requires_participating_regions() {
1043 let query_ctx = QueryContext::arc();
1044 let (_tx, rx) = tokio::sync::oneshot::channel();
1045 let state = TaskState::new(query_ctx, rx);
1046
1047 assert!(!state.can_advance_full_snapshot_checkpoints(&BTreeSet::new(), &HashMap::new()));
1048 assert!(!state.can_advance_full_snapshot_checkpoints(
1049 &BTreeSet::from([1_u64, 2_u64]),
1050 &HashMap::from([(1_u64, 10_u64)]),
1051 ));
1052 assert!(state.can_advance_full_snapshot_checkpoints(
1053 &BTreeSet::from([1_u64, 2_u64]),
1054 &HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]),
1055 ));
1056 }
1057
1058 #[test]
1059 fn test_incremental_checkpoint_advancement_requires_participation_alignment() {
1060 let query_ctx = QueryContext::arc();
1061 let (_tx, rx) = tokio::sync::oneshot::channel();
1062 let mut state = TaskState::new(query_ctx, rx);
1063 state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
1064
1065 assert!(
1066 state.can_advance_incremental_checkpoints_with_participation(
1067 &BTreeSet::from([1_u64]),
1068 &HashMap::from([(1_u64, 11_u64)]),
1069 )
1070 );
1071 assert!(
1072 !state.can_advance_incremental_checkpoints_with_participation(
1073 &BTreeSet::from([1_u64, 2_u64]),
1074 &HashMap::from([(1_u64, 11_u64)]),
1075 )
1076 );
1077 assert!(
1078 !state.can_advance_incremental_checkpoints_with_participation(
1079 &BTreeSet::from([3_u64]),
1080 &HashMap::from([(3_u64, 11_u64)]),
1081 )
1082 );
1083 assert!(
1084 !state.can_advance_incremental_checkpoints_with_participation(
1085 &BTreeSet::from([1_u64]),
1086 &HashMap::from([(1_u64, 9_u64)]),
1087 )
1088 );
1089 assert!(
1090 state.can_advance_incremental_checkpoints_with_participation(
1091 &BTreeSet::from([1_u64, 2_u64]),
1092 &HashMap::from([(1_u64, 11_u64), (2_u64, 21_u64)]),
1093 )
1094 );
1095
1096 state.disable_incremental();
1097 assert!(
1098 !state.can_advance_incremental_checkpoints_with_participation(
1099 &BTreeSet::from([1_u64, 2_u64]),
1100 &HashMap::from([(1_u64, 12_u64), (2_u64, 22_u64)]),
1101 )
1102 );
1103 }
1104
1105 #[test]
1106 fn test_incremental_checkpoint_advancement_merges_participating_subset() {
1107 let query_ctx = QueryContext::arc();
1108 let (_tx, rx) = tokio::sync::oneshot::channel();
1109 let mut state = TaskState::new(query_ctx, rx);
1110 state.advance_checkpoints(HashMap::from([
1111 (1_u64, 10_u64),
1112 (2_u64, 20_u64),
1113 (3_u64, 30_u64),
1114 ]));
1115
1116 state.advance_incremental_checkpoints_with_participation(
1117 &BTreeSet::from([1_u64, 3_u64]),
1118 HashMap::from([(1_u64, 12_u64), (3_u64, 35_u64)]),
1119 );
1120
1121 assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
1122 assert_eq!(
1123 state.checkpoints(),
1124 &BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)])
1125 );
1126 }
1127
1128 #[test]
1129 fn test_filter_expr_info_predicate_for_col_empty_ranges() {
1130 let filter = FilterExprInfo {
1131 expr: datafusion_expr::col("ts"),
1132 col_name: "ts".to_string(),
1133 time_ranges: vec![],
1134 window_size: chrono::Duration::seconds(1),
1135 };
1136
1137 assert!(filter.predicate_for_col("time_window").unwrap().is_none());
1138 }
1139
1140 #[test]
1141 fn test_filter_expr_info_predicate_for_col_single_range() {
1142 let filter = FilterExprInfo {
1143 expr: datafusion_expr::col("ts"),
1144 col_name: "ts".to_string(),
1145 time_ranges: vec![(Timestamp::new_second(0), Timestamp::new_second(1))],
1146 window_size: chrono::Duration::seconds(1),
1147 };
1148
1149 let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
1150 let unparser = datafusion::sql::unparser::Unparser::default();
1151 assert_eq!(
1152 "((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP)))",
1153 unparser.expr_to_sql(&predicate).unwrap().to_string()
1154 );
1155 }
1156
1157 #[test]
1158 fn test_filter_expr_info_predicate_for_col_multiple_ranges() {
1159 let filter = FilterExprInfo {
1160 expr: datafusion_expr::col("ts"),
1161 col_name: "ts".to_string(),
1162 time_ranges: vec![
1163 (Timestamp::new_second(0), Timestamp::new_second(1)),
1164 (Timestamp::new_second(10), Timestamp::new_second(11)),
1165 ],
1166 window_size: chrono::Duration::seconds(1),
1167 };
1168
1169 let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
1170 let unparser = datafusion::sql::unparser::Unparser::default();
1171 assert_eq!(
1172 "(((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP))) OR ((time_window >= CAST('1970-01-01 00:00:10' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:11' AS TIMESTAMP))))",
1173 unparser.expr_to_sql(&predicate).unwrap().to_string()
1174 );
1175 }
1176
1177 fn state_with_past_update(age: Duration) -> TaskState {
1179 let query_ctx = QueryContext::arc();
1180 let (_tx, rx) = tokio::sync::oneshot::channel();
1181 let mut state = TaskState::new(query_ctx, rx);
1182 state.last_update_time = Instant::now() - age;
1183 state
1184 }
1185
/// With the final flag set to `true` and a zero `last_query_duration`
/// (the `TaskState::new` default), the next start is expected at
/// `last_update_time + min_refresh` rather than one 60s time window later.
///
/// NOTE(review): the final boolean presumably enables the short incremental
/// cadence — confirm against `get_next_start_query_time`.
#[test]
fn test_short_incremental_cadence_uses_min_refresh() {
    let min_refresh = Duration::from_secs(5);
    let window = Some(Duration::from_secs(60));
    let task_state = state_with_past_update(Duration::from_secs(10));

    let next_start =
        task_state.get_next_start_query_time(1, &window, min_refresh, None, 20, true);

    assert_eq!(next_start, task_state.last_update_time + min_refresh);
}
1210
/// When the previous query took longer (20s) than `min_refresh` (5s), the
/// next start is expected at `last_update_time + last_query_duration`.
#[test]
fn test_short_incremental_cadence_respects_last_query_duration() {
    let min_refresh = Duration::from_secs(5);
    let window = Some(Duration::from_secs(60));

    let mut task_state = state_with_past_update(Duration::from_secs(10));
    task_state.last_query_duration = Duration::from_secs(20);

    let next_start =
        task_state.get_next_start_query_time(1, &window, min_refresh, None, 20, true);

    let expected = task_state.last_update_time + task_state.last_query_duration;
    assert_eq!(next_start, expected);
}
1231
/// A `max_timeout` (5s) smaller than both `min_refresh` (30s) and the last
/// query duration (20s) bounds the delay: the next start is expected at
/// `last_update_time + max_timeout`.
#[test]
fn test_short_incremental_cadence_respects_max_timeout() {
    let max_timeout = Duration::from_secs(5);
    let min_refresh = Duration::from_secs(30);
    let window = Some(Duration::from_secs(60));

    let mut task_state = state_with_past_update(Duration::from_secs(10));
    task_state.last_query_duration = Duration::from_secs(20);

    let next_start = task_state.get_next_start_query_time(
        1,
        &window,
        min_refresh,
        Some(max_timeout),
        20,
        true,
    );

    assert_eq!(next_start, task_state.last_update_time + max_timeout);
}
1253
/// With the final flag set to `false`, neither `min_refresh` (5s) nor the
/// last query duration (1s) shortens the wait: the next start is expected
/// one full 60s time window after `last_update_time`.
///
/// NOTE(review): the final boolean presumably disables the short incremental
/// cadence (full-snapshot mode) — confirm against `get_next_start_query_time`.
#[test]
fn test_full_snapshot_ignores_short_cadence() {
    let window_len = Duration::from_secs(60);
    let window = Some(window_len);
    let min_refresh = Duration::from_secs(5);

    let mut task_state = state_with_past_update(Duration::from_secs(10));
    task_state.last_query_duration = Duration::from_secs(1);

    let next_start =
        task_state.get_next_start_query_time(1, &window, min_refresh, None, 20, false);

    assert_eq!(next_start, task_state.last_update_time + window_len);
}
1280
/// A dirty range of one hour with a 1s time window and a fifth argument of
/// `1` must be scheduled no later than "now", whichever value the final
/// boolean takes.
///
/// NOTE(review): the fifth argument is presumably the maximum number of
/// dirty windows handled per query — confirm against the callee.
#[test]
fn test_dirty_window_overflow_schedules_immediately_even_with_short_cadence() {
    let window = Some(Duration::from_secs(1));
    let min_refresh = Duration::from_secs(5);

    let mut task_state = state_with_past_update(Duration::from_secs(10));
    task_state
        .dirty_time_windows
        .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(3600)));

    // Exercise the `true` case first, then `false`.
    for cadence_flag in [true, false] {
        let next_start = task_state.get_next_start_query_time(
            1,
            &window,
            min_refresh,
            None,
            1,
            cadence_flag,
        );
        assert!(
            next_start <= Instant::now(),
            "dirty overflow should schedule immediately"
        );
    }
}
1323
/// With the final flag set to `false`, the next start is expected one full
/// 60s time window after `last_update_time`, regardless of `min_refresh`.
///
/// NOTE(review): this test drives exactly the same inputs as
/// `test_full_snapshot_ignores_short_cadence` and never sets
/// `incremental_disabled` on the state. Presumably the caller folds that
/// field into the boolean argument before calling — confirm, otherwise this
/// test adds no coverage.
#[test]
fn test_incremental_disabled_ignores_short_cadence() {
    let full_window = Duration::from_secs(60);
    let min_refresh = Duration::from_secs(5);

    let mut task_state = state_with_past_update(Duration::from_secs(10));
    task_state.last_query_duration = Duration::from_secs(1);

    let next_start = task_state.get_next_start_query_time(
        1,
        &Some(full_window),
        min_refresh,
        None,
        20,
        false,
    );

    let expected = task_state.last_update_time + full_window;
    assert_eq!(next_start, expected);
}
1355}