1use std::ops::Add;
16use std::sync::Arc;
17use std::time::Duration;
18
19use backon::{BackoffBuilder, ExponentialBuilder};
20use common_error::ext::PlainError;
21use common_error::status_code::StatusCode;
22use common_event_recorder::EventRecorderRef;
23use common_telemetry::tracing::warn;
24use common_telemetry::tracing_context::{FutureExt, TracingContext};
25use common_telemetry::{debug, error, info, tracing};
26use rand::Rng;
27use snafu::ResultExt;
28use tokio::time;
29
30use crate::error::{self, ProcedurePanicSnafu, Result, RollbackTimesExceededSnafu};
31use crate::local::{ManagerContext, ProcedureMeta, ProcedureMetaRef};
32use crate::procedure::{Output, StringKey};
33use crate::rwlock::OwnedKeyRwLockGuard;
34use crate::store::{ProcedureMessage, ProcedureStore};
35use crate::{
36 BoxedProcedure, Context, Error, Procedure, ProcedureId, ProcedureState, ProcedureWithId, Status,
37};
38
/// Guard that accompanies a running procedure and performs its cleanup on drop.
///
/// When dropped it marks the procedure as failed unless [`ProcedureGuard::finish`]
/// was called, notifies the parent procedure (if any), releases the held key guards
/// and asks the key lock to clean the procedure's lock keys.
struct ProcedureGuard {
    /// Metadata of the guarded procedure.
    meta: ProcedureMetaRef,
    /// Manager context used to notify the parent and to clean lock keys.
    manager_ctx: Arc<ManagerContext>,
    /// Key lock guards held on behalf of the procedure.
    key_guards: Vec<OwnedKeyRwLockGuard>,
    /// `true` once `finish()` has been called; `false` means the guard is being
    /// dropped without the runner reaching its normal end.
    finish: bool,
}
46
47impl ProcedureGuard {
48 fn new(meta: ProcedureMetaRef, manager_ctx: Arc<ManagerContext>) -> ProcedureGuard {
50 ProcedureGuard {
51 meta,
52 manager_ctx,
53 key_guards: vec![],
54 finish: false,
55 }
56 }
57
58 fn finish(mut self) {
60 self.finish = true;
61 }
62}
63
64impl Drop for ProcedureGuard {
65 fn drop(&mut self) {
66 if !self.finish {
67 error!("Procedure {} exits unexpectedly", self.meta.id);
68
69 let err = ProcedurePanicSnafu {
73 procedure_id: self.meta.id,
74 }
75 .build();
76 self.meta.set_state(ProcedureState::failed(Arc::new(err)));
77 }
78
79 if let Some(parent_id) = self.meta.parent_id {
81 self.manager_ctx.notify_by_subprocedure(parent_id);
82 }
83
84 while !self.key_guards.is_empty() {
86 self.key_guards.pop();
87 }
88
89 self.manager_ctx
91 .key_lock
92 .clean_keys(self.meta.lock_key.keys_to_lock().map(|k| k.as_string()));
93 }
94}
95
96fn find_lock_conflicts<'a>(
102 parent_keys: impl Iterator<Item = &'a StringKey>,
103 child_keys: impl Iterator<Item = &'a StringKey>,
104) -> Vec<String> {
105 use std::collections::HashMap;
106
107 let mut parent_map = HashMap::new();
109 for key in parent_keys {
110 match key {
111 StringKey::Exclusive(k) => {
112 parent_map.insert(k.as_str(), true);
113 }
114 StringKey::Share(k) => {
115 parent_map.entry(k.as_str()).or_insert(false);
116 }
117 }
118 }
119
120 child_keys
121 .filter_map(|child_key| match child_key {
122 StringKey::Exclusive(k) | StringKey::Share(k)
123 if parent_map.get(k.as_str()) == Some(&true) =>
124 {
125 Some(k.clone())
126 }
127 StringKey::Exclusive(k) if parent_map.get(k.as_str()) == Some(&false) => {
128 Some(k.clone())
129 }
130 _ => None,
131 })
132 .collect()
133}
134
/// Drives a single procedure: acquires its lock keys, executes it step by step,
/// persists its progress and handles retries and rollback.
pub(crate) struct Runner {
    /// Metadata of the procedure being run (id, state, lock keys, parent id, ...).
    pub(crate) meta: ProcedureMetaRef,
    /// The procedure implementation to execute.
    pub(crate) procedure: BoxedProcedure,
    /// Shared manager context (key locks, poison manager, procedure registry).
    pub(crate) manager_ctx: Arc<ManagerContext>,
    /// Step number passed to the store on the next write; incremented after each
    /// successful persist, commit or rollback write.
    pub(crate) step: u32,
    /// Builds the backoff sequences used between retries and rollback attempts.
    pub(crate) exponential_builder: ExponentialBuilder,
    /// Store used to persist, commit, roll back and delete procedure records.
    pub(crate) store: Arc<ProcedureStore>,
    /// Reset to `false` when execution starts.
    /// NOTE(review): not read anywhere in the visible part of this file — confirm usage.
    pub(crate) rolling_back: bool,
    /// Optional event recorder, forwarded to the metadata and runners of
    /// submitted subprocedures.
    pub(crate) event_recorder: Option<EventRecorderRef>,
}
145
146impl Runner {
    /// Returns whether the manager context reports that it is running.
    pub(crate) fn running(&self) -> bool {
        self.manager_ctx.running()
    }
151
    /// Runs the procedure until it reaches a terminal state, consuming the runner.
    ///
    /// Acquires all lock keys of the procedure, executes it, records start and end
    /// times and, for a root procedure (no parent), notifies the manager that the
    /// whole procedure tree finished and deletes the tree's records from the store.
    pub(crate) async fn run(mut self) {
        // If this guard is dropped before `finish()` is called below, its `Drop`
        // implementation marks the procedure as failed.
        let mut guard = ProcedureGuard::new(self.meta.clone(), self.manager_ctx.clone());

        info!(
            "Runner {}-{} starts",
            self.procedure.type_name(),
            self.meta.id
        );

        // Acquire the keys one by one in the order yielded by `keys_to_lock()`:
        // shared keys take a read lock, exclusive keys take a write lock. The
        // guards are stored in `guard` so that they are released when it drops.
        for key in self.meta.lock_key.keys_to_lock() {
            let key_guard = match key {
                StringKey::Share(key) => self.manager_ctx.key_lock.read(key.clone()).await.into(),
                StringKey::Exclusive(key) => {
                    self.manager_ctx.key_lock.write(key.clone()).await.into()
                }
            };

            guard.key_guards.push(key_guard);
        }

        self.meta.set_start_time_ms();
        self.execute_procedure_in_loop().await;
        self.meta.set_end_time_ms();

        // Normal end: consuming the guard here runs its cleanup (parent
        // notification, lock release) without marking the procedure as failed.
        guard.finish();

        // Only a root procedure cleans up the whole tree.
        if self.meta.parent_id.is_none() {
            let procedure_ids = self.manager_ctx.procedures_in_tree(&self.meta);
            self.manager_ctx.on_procedures_finish(&procedure_ids);

            // When the manager is no longer running, the stored records are left
            // in place and the runner exits without the final log line.
            if !self.running() {
                return;
            }

            // Deletion is best effort: failures are logged and do not stop the loop.
            for id in procedure_ids {
                if let Err(e) = self.store.delete_procedure(id).await {
                    error!(
                        e;
                        "Runner {}-{} failed to delete procedure {}",
                        self.procedure.type_name(),
                        self.meta.id,
                        id,
                    );
                }
            }
        }

        info!(
            "Runner {}-{} exits",
            self.procedure.type_name(),
            self.meta.id
        );
    }
221
222 async fn execute_procedure_in_loop(&mut self) {
223 let ctx = Context {
224 procedure_id: self.meta.id,
225 provider: self.manager_ctx.clone(),
226 };
227
228 self.rolling_back = false;
229 self.execute_once_with_retry(&ctx).await;
230 }
231
232 async fn execute_once_with_retry(&mut self, ctx: &Context) {
233 let mut retry = self.exponential_builder.build();
234 let mut retry_times = 0;
235
236 let mut rollback = self.exponential_builder.build();
237 let mut rollback_times = 0;
238
239 loop {
240 if !self.running() {
242 self.meta.set_state(ProcedureState::failed(Arc::new(
243 error::ManagerNotStartSnafu {}.build(),
244 )));
245 return;
246 }
247 let state = self.meta.state();
248 match state {
249 ProcedureState::Running => {}
250 ProcedureState::Retrying { error } => {
251 retry_times += 1;
252 if let Some(d) = retry.next() {
253 let millis = d.as_millis() as u64;
254 let noise = rand::rng().random_range(0..(millis / 4) + 1);
256 let d = d.add(Duration::from_millis(noise));
257
258 self.wait_on_err(d, retry_times).await;
259 } else {
260 self.meta
261 .set_state(ProcedureState::prepare_rollback(Arc::new(
262 Error::RetryTimesExceeded {
263 source: error.clone(),
264 procedure_id: self.meta.id,
265 },
266 )));
267 }
268 }
269 ProcedureState::PrepareRollback { error }
270 | ProcedureState::RollingBack { error } => {
271 rollback_times += 1;
272 if let Some(d) = rollback.next() {
273 self.wait_on_err(d, rollback_times).await;
274 } else {
275 let err = Err::<(), Arc<Error>>(error)
276 .context(RollbackTimesExceededSnafu {
277 procedure_id: self.meta.id,
278 })
279 .unwrap_err();
280 self.meta.set_state(ProcedureState::failed(Arc::new(err)));
281 return;
282 }
283 }
284 ProcedureState::Done { .. } => return,
285 ProcedureState::Failed { .. } => return,
286 ProcedureState::Poisoned { .. } => return,
287 }
288 self.execute_once(ctx).await;
289 }
290 }
291
292 async fn clean_poisons(&mut self) -> Result<()> {
293 let mut error = None;
294 for key in self.meta.poison_keys.iter() {
295 let key = key.to_string();
296 if let Err(e) = self
297 .manager_ctx
298 .poison_manager
299 .delete_poison(key, self.meta.id.to_string())
300 .await
301 {
302 error!(e; "Failed to clean poisons for procedure: {}", self.meta.id);
303 error = Some(e);
304 }
305 }
306
307 if let Some(e) = error {
309 return Err(e);
310 }
311 Ok(())
312 }
313
314 async fn rollback(&mut self, ctx: &Context, err: Arc<Error>) {
315 if self.procedure.rollback_supported()
316 && let Err(e) = self.procedure.rollback(ctx).await
317 {
318 self.meta
319 .set_state(ProcedureState::rolling_back(Arc::new(e)));
320 return;
321 }
322 self.meta.set_state(ProcedureState::failed(err));
323 }
324
325 async fn prepare_rollback(&mut self, err: Arc<Error>) {
326 if let Err(e) = self.write_rollback_procedure_state(err.to_string()).await {
327 self.meta
328 .set_state(ProcedureState::prepare_rollback(Arc::new(e)));
329 return;
330 }
331 if self.procedure.rollback_supported() {
332 self.meta.set_state(ProcedureState::rolling_back(err));
333 } else {
334 self.meta.set_state(ProcedureState::failed(err));
335 }
336 }
337
    /// Performs one step of the procedure state machine and updates the state in
    /// `self.meta` according to the outcome.
    ///
    /// - `Running` / `Retrying`: executes the procedure once and handles the
    ///   returned [`Status`] or error.
    /// - `PrepareRollback`: writes the rollback record via `prepare_rollback`.
    /// - `RollingBack`: runs the rollback via `rollback`.
    /// - `Failed` / `Done` / `Poisoned`: does nothing.
    async fn execute_once(&mut self, ctx: &Context) {
        match self.meta.state() {
            ProcedureState::Running | ProcedureState::Retrying { .. } => {
                match self.procedure.execute(ctx).await {
                    Ok(status) => {
                        debug!(
                            "Execute procedure {}-{} once, status: {:?}, need_persist: {}",
                            self.procedure.type_name(),
                            self.meta.id,
                            status,
                            status.need_persist(),
                        );

                        // The manager stopped while the step was executing: fail
                        // without cleaning poisons or persisting anything.
                        if !self.running() {
                            self.meta.set_state(ProcedureState::failed(Arc::new(
                                error::ManagerNotStartSnafu {}.build(),
                            )));
                            return;
                        }

                        // Poison cleanup happens before persisting; a failure puts
                        // the procedure into `Retrying`.
                        if status.need_clean_poisons()
                            && let Err(e) = self.clean_poisons().await
                        {
                            error!(e; "Failed to clean poison for procedure: {}", self.meta.id);
                            self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                            return;
                        }

                        // A failed persist also puts the procedure into `Retrying`.
                        if status.need_persist()
                            && let Err(e) = self.persist_procedure().await
                        {
                            error!(e; "Failed to persist procedure: {}", self.meta.id);
                            self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                            return;
                        }

                        match status {
                            Status::Executing { .. } => {
                                // A successful step after a retry moves the state
                                // back to `Running`.
                                let prev_state = self.meta.state();
                                if !matches!(prev_state, ProcedureState::Running) {
                                    info!(
                                        "Set Procedure {}-{} state to running, prev_state: {:?}",
                                        self.procedure.type_name(),
                                        self.meta.id,
                                        prev_state
                                    );
                                    self.meta.set_state(ProcedureState::Running);
                                }
                            }
                            Status::Suspended { subprocedures, .. } => {
                                // Same state reset as above, then submit the
                                // subprocedures and wait for a notification.
                                let prev_state = self.meta.state();
                                if !matches!(prev_state, ProcedureState::Running) {
                                    info!(
                                        "Set Procedure {}-{} state to running, prev_state: {:?}",
                                        self.procedure.type_name(),
                                        self.meta.id,
                                        prev_state
                                    );
                                    self.meta.set_state(ProcedureState::Running);
                                }
                                self.on_suspended(subprocedures).await;
                            }
                            Status::Done { output } => {
                                // The procedure is only marked done after the
                                // commit record was written.
                                if let Err(e) = self.commit_procedure().await {
                                    error!(e; "Failed to commit procedure: {}", self.meta.id);
                                    self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                                    return;
                                }

                                self.done(output);
                            }
                            Status::Poisoned { error, keys } => {
                                error!(
                                    error;
                                    "Procedure {}-{} is poisoned, keys: {:?}",
                                    self.procedure.type_name(),
                                    self.meta.id,
                                    keys,
                                );
                                self.meta
                                    .set_state(ProcedureState::poisoned(keys, Arc::new(error)));
                            }
                        }
                    }
                    Err(e) => {
                        error!(
                            e;
                            "Failed to execute procedure {}-{}, retry: {}, clean_poisons: {}",
                            self.procedure.type_name(),
                            self.meta.id,
                            e.is_retry_later(),
                            e.need_clean_poisons(),
                        );

                        // The manager stopped: report that instead of the
                        // execution error.
                        if !self.running() {
                            self.meta.set_state(ProcedureState::failed(Arc::new(
                                error::ManagerNotStartSnafu {}.build(),
                            )));
                            return;
                        }

                        if e.need_clean_poisons() {
                            // The inner `e` shadows the execution error: a cleanup
                            // failure is what gets retried.
                            if let Err(e) = self.clean_poisons().await {
                                error!(e; "Failed to clean poison for procedure: {}", self.meta.id);
                                self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                                return;
                            }
                            debug!(
                                "Procedure {}-{} cleaned poisons",
                                self.procedure.type_name(),
                                self.meta.id,
                            );
                        }

                        if e.is_retry_later() {
                            self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                            return;
                        }

                        // Non-retryable error: roll back when supported, otherwise
                        // fail directly.
                        if self.procedure.rollback_supported() {
                            self.meta
                                .set_state(ProcedureState::prepare_rollback(Arc::new(e)));
                        } else {
                            self.meta.set_state(ProcedureState::failed(Arc::new(e)));
                        }
                    }
                }
            }
            ProcedureState::PrepareRollback { error } => self.prepare_rollback(error).await,
            ProcedureState::RollingBack { error } => self.rollback(ctx, error).await,
            ProcedureState::Failed { .. }
            | ProcedureState::Done { .. }
            | ProcedureState::Poisoned { .. } => (),
        }
    }
476
    /// Registers `procedure` as a child of this procedure and spawns a runner for it.
    ///
    /// Nothing is submitted when the manager is not running or when the manager
    /// already contains a procedure with `procedure_id`. If the runner task cannot
    /// be spawned, the registration is undone and the child is not recorded.
    ///
    /// # Panics
    ///
    /// Panics if the procedure id is inserted into the manager by someone else
    /// between the existence check and the insertion.
    fn submit_subprocedure(
        &self,
        procedure_id: ProcedureId,
        procedure_state: ProcedureState,
        procedure: BoxedProcedure,
    ) {
        if !self.running() {
            warn!(
                "ProcedureManager is not running, skip submitting subprocedure {}-{}",
                procedure.type_name(),
                procedure_id
            );
            return;
        }

        // Already known to the manager: do not submit it a second time.
        if self.manager_ctx.contains_procedure(procedure_id) {
            return;
        }

        // The child runner starts from step 0.
        let step = 0;

        let meta = Arc::new(ProcedureMeta::new(
            procedure_id,
            procedure_state,
            Some(self.meta.id),
            procedure.lock_key(),
            procedure.poison_keys(),
            procedure.type_name(),
            self.event_recorder.clone(),
            procedure.user_metadata(),
        ));
        let runner = Runner {
            meta: meta.clone(),
            procedure,
            manager_ctx: self.manager_ctx.clone(),
            step,
            exponential_builder: self.exponential_builder,
            store: self.store.clone(),
            rolling_back: false,
            event_recorder: self.event_recorder.clone(),
        };

        // The meta is inserted before the runner is spawned. Existence was checked
        // above, so a failed insertion is treated as a broken invariant.
        assert!(
            self.manager_ctx.try_insert_procedure(meta),
            "Procedure {}-{} submit an existing procedure {}-{}",
            self.procedure.type_name(),
            self.meta.id,
            runner.procedure.type_name(),
            procedure_id,
        );

        let parent_id = self.meta.id;

        // Carry the current tracing context into the spawned task.
        let tracing_context = TracingContext::from_current_span();
        if !self.manager_ctx.spawn_runner_task(procedure_id, || {
            common_runtime::spawn_global(async move {
                let span = tracing_context.attach(tracing::info_span!(
                    "LocalManager::submit_subprocedure",
                    procedure_name = %runner.meta.type_name,
                    procedure_id = %runner.meta.id,
                    parent_id = %parent_id,
                ));
                runner.run().trace(span).await
            })
        }) {
            // Spawning was rejected: undo the registration made above.
            self.manager_ctx.remove_procedure(procedure_id);
            return;
        }

        // Record the child only after its runner task was accepted.
        self.meta.push_child(procedure_id);
    }
557
558 async fn wait_on_err(&mut self, d: Duration, i: u64) {
560 info!(
561 "Procedure {}-{} retry for the {} times after {} millis",
562 self.procedure.type_name(),
563 self.meta.id,
564 i,
565 d.as_millis(),
566 );
567 time::sleep(d).await;
568 }
569
570 async fn on_suspended(&mut self, subprocedures: Vec<ProcedureWithId>) {
571 let has_child = !subprocedures.is_empty();
572
573 for sub in &subprocedures {
578 let conflicting = find_lock_conflicts(
579 self.meta.lock_key.keys_to_lock(),
580 sub.procedure.lock_key().keys_to_lock(),
581 );
582 if !conflicting.is_empty() {
583 let err_msg = format!(
584 "Deadlock prevented: subprocedure {}-{} shares conflicting lock key(s) {:?} \
585 with parent {}-{}. Parent holds these locks and would wait for child \
586 completion, but child cannot acquire them.",
587 sub.procedure.type_name(),
588 sub.id,
589 conflicting,
590 self.procedure.type_name(),
591 self.meta.id,
592 );
593 error!("{}", err_msg);
594 let err = Arc::new(Error::external(PlainError::new(
595 err_msg,
596 StatusCode::Internal,
597 )));
598 if self.procedure.rollback_supported() {
599 self.meta.set_state(ProcedureState::prepare_rollback(err));
600 } else {
601 self.meta.set_state(ProcedureState::failed(err));
602 }
603 return;
604 }
605 }
606
607 for subprocedure in subprocedures {
608 info!(
609 "Procedure {}-{} submit subprocedure {}-{}",
610 self.procedure.type_name(),
611 self.meta.id,
612 subprocedure.procedure.type_name(),
613 subprocedure.id,
614 );
615
616 self.submit_subprocedure(
617 subprocedure.id,
618 ProcedureState::Running,
619 subprocedure.procedure,
620 );
621 }
622
623 info!(
624 "Procedure {}-{} is waiting for subprocedures",
625 self.procedure.type_name(),
626 self.meta.id,
627 );
628
629 if has_child {
631 self.meta.child_notify.notified().await;
632
633 info!(
634 "Procedure {}-{} is waked up",
635 self.procedure.type_name(),
636 self.meta.id,
637 );
638 }
639 }
640
641 async fn persist_procedure(&mut self) -> Result<()> {
642 let type_name = self.procedure.type_name().to_string();
643 let data = self.procedure.dump()?;
644
645 self.store
646 .store_procedure(
647 self.meta.id,
648 self.step,
649 type_name,
650 data,
651 self.meta.parent_id,
652 )
653 .await
654 .map_err(|e| {
655 error!(
656 e; "Failed to persist procedure {}-{}",
657 self.procedure.type_name(),
658 self.meta.id
659 );
660 e
661 })?;
662 self.step += 1;
663 Ok(())
664 }
665
666 async fn commit_procedure(&mut self) -> Result<()> {
667 self.store
668 .commit_procedure(self.meta.id, self.step)
669 .await
670 .map_err(|e| {
671 error!(
672 e; "Failed to commit procedure {}-{}",
673 self.procedure.type_name(),
674 self.meta.id
675 );
676 e
677 })?;
678 self.step += 1;
679 Ok(())
680 }
681
682 async fn write_rollback_procedure_state(&mut self, error: String) -> Result<()> {
683 let type_name = self.procedure.type_name().to_string();
685 let data = self.procedure.dump()?;
686 let message = ProcedureMessage {
687 type_name,
688 data,
689 parent_id: self.meta.parent_id,
690 step: self.step,
691 error: Some(error),
692 };
693 self.store
694 .rollback_procedure(self.meta.id, message)
695 .await
696 .map_err(|e| {
697 error!(
698 e; "Failed to write rollback key for procedure {}-{}",
699 self.procedure.type_name(),
700 self.meta.id
701 );
702 e
703 })?;
704 self.step += 1;
705 Ok(())
706 }
707
708 fn done(&self, output: Option<Output>) {
709 info!(
711 "Procedure {}-{} done",
712 self.procedure.type_name(),
713 self.meta.id,
714 );
715
716 self.meta.set_state(ProcedureState::Done { output });
718 }
719}
720
impl Drop for Runner {
    /// Removes this runner's task entry (keyed by procedure id) from the manager
    /// context when the runner goes away.
    fn drop(&mut self) {
        self.manager_ctx.remove_runner_task(self.meta.id);
    }
}
726
727#[cfg(test)]
728mod tests {
729 use std::assert_matches;
730 use std::sync::Arc;
731 use std::sync::atomic::{AtomicU64, Ordering};
732
733 use async_trait::async_trait;
734 use common_error::ext::{ErrorExt, PlainError};
735 use common_error::mock::MockError;
736 use common_error::status_code::StatusCode;
737 use common_test_util::temp_dir::create_temp_dir;
738 use futures::future::join_all;
739 use futures_util::FutureExt;
740 use futures_util::future::BoxFuture;
741 use object_store::{EntryMode, ObjectStore};
742 use tokio::sync::mpsc;
743 use tokio::sync::watch::Receiver;
744
745 use super::*;
746 use crate::local::{DynamicKeyLockGuard, test_util};
747 use crate::procedure::PoisonKeys;
748 use crate::store::proc_path;
749 use crate::test_util::InMemoryPoisonStore;
750 use crate::{ContextProvider, Error, LockKey, PoisonKey, Procedure};
751
752 const ROOT_ID: &str = "9f805a1f-05f7-490c-9f91-bd56e3cc54c1";
753
754 fn new_runner(
755 meta: ProcedureMetaRef,
756 procedure: BoxedProcedure,
757 store: Arc<ProcedureStore>,
758 ) -> Runner {
759 Runner {
760 meta,
761 procedure,
762 manager_ctx: Arc::new(ManagerContext::new(
763 Arc::new(InMemoryPoisonStore::default()),
764 )),
765 step: 0,
766 exponential_builder: ExponentialBuilder::default(),
767 store,
768 rolling_back: false,
769 event_recorder: None,
770 }
771 }
772
773 async fn check_files(
774 object_store: &ObjectStore,
775 procedure_store: &ProcedureStore,
776 procedure_id: ProcedureId,
777 files: &[&str],
778 ) {
779 let dir = proc_path!(procedure_store, "{procedure_id}/");
780 let lister = object_store.list(&dir).await.unwrap();
781 let mut files_in_dir: Vec<_> = lister
782 .into_iter()
783 .filter(|x| x.metadata().mode() == EntryMode::FILE)
784 .map(|de| de.name().to_string())
785 .collect();
786 files_in_dir.sort_unstable();
787 assert_eq!(files, files_in_dir);
788 }
789
    /// Builds a [`Context`] for `procedure_id` that uses the given `provider`.
    fn context_with_provider(
        procedure_id: ProcedureId,
        provider: Arc<dyn ContextProvider>,
    ) -> Context {
        Context {
            procedure_id,
            provider,
        }
    }
799
    /// Builds a [`Context`] for `procedure_id` whose provider must never be used.
    ///
    /// Every method of the mock provider is `unimplemented!()`, so a procedure
    /// that touches the provider through this context panics.
    fn context_without_provider(procedure_id: ProcedureId) -> Context {
        /// Provider stub whose methods all panic when called.
        struct MockProvider;

        #[async_trait]
        impl ContextProvider for MockProvider {
            async fn procedure_state(
                &self,
                _procedure_id: ProcedureId,
            ) -> Result<Option<ProcedureState>> {
                unimplemented!()
            }

            async fn procedure_state_receiver(
                &self,
                _procedure_id: ProcedureId,
            ) -> Result<Option<Receiver<ProcedureState>>> {
                unimplemented!()
            }

            async fn try_put_poison(
                &self,
                _key: &PoisonKey,
                _procedure_id: ProcedureId,
            ) -> Result<()> {
                unimplemented!()
            }

            async fn acquire_lock(&self, _key: &StringKey) -> DynamicKeyLockGuard {
                unimplemented!()
            }
        }

        Context {
            procedure_id,
            provider: Arc::new(MockProvider),
        }
    }
837
    /// Boxed rollback callback of a [`ProcedureAdapter`]: receives a clone of the
    /// context and returns a boxed future resolving to the rollback result.
    type RollbackFn = Box<dyn FnMut(Context) -> BoxFuture<'static, Result<()>> + Send>;
839
    /// Test procedure whose behavior is supplied by closures.
    struct ProcedureAdapter<F> {
        /// String returned by `dump()`.
        data: String,
        /// Lock key reported by `lock_key()` and copied into the test meta.
        lock_key: LockKey,
        /// Poison keys reported by `poison_keys()` and copied into the test meta.
        poison_keys: PoisonKeys,
        /// Closure invoked on every `execute()` call.
        exec_fn: F,
        /// Closure invoked on `rollback()`; `None` means rollback is unsupported.
        rollback_fn: Option<RollbackFn>,
    }
847
848 impl<F> ProcedureAdapter<F> {
849 fn new_meta(&self, uuid: &str) -> ProcedureMetaRef {
850 let mut meta = test_util::procedure_meta_for_test();
851 meta.id = ProcedureId::parse_str(uuid).unwrap();
852 meta.lock_key = self.lock_key.clone();
853 meta.poison_keys = self.poison_keys.clone();
854
855 Arc::new(meta)
856 }
857 }
858
859 #[async_trait]
860 impl<F> Procedure for ProcedureAdapter<F>
861 where
862 F: FnMut(Context) -> BoxFuture<'static, Result<Status>> + Send + Sync,
863 {
864 fn type_name(&self) -> &str {
865 "ProcedureAdapter"
866 }
867
868 async fn execute(&mut self, ctx: &Context) -> Result<Status> {
869 let f = (self.exec_fn)(ctx.clone());
870 f.await
871 }
872
873 async fn rollback(&mut self, ctx: &Context) -> Result<()> {
874 if let Some(f) = &mut self.rollback_fn {
875 return (f)(ctx.clone()).await;
876 }
877 Ok(())
878 }
879
880 fn rollback_supported(&self) -> bool {
881 self.rollback_fn.is_some()
882 }
883
884 fn dump(&self) -> Result<String> {
885 Ok(self.data.clone())
886 }
887
888 fn lock_key(&self) -> LockKey {
889 self.lock_key.clone()
890 }
891
892 fn poison_keys(&self) -> PoisonKeys {
893 self.poison_keys.clone()
894 }
895 }
896
    /// Shared body of the "normal execution" tests.
    ///
    /// Runs a procedure that returns `Status::executing(persist)` on its first
    /// execution and `Status::done()` afterwards. Checks that the state is
    /// running after the first step and done after the second, and that the
    /// procedure directory contains `first_files` / `second_files` respectively.
    async fn execute_once_normal(persist: bool, first_files: &[&str], second_files: &[&str]) {
        // Counts how many times the procedure has been executed.
        let mut times = 0;
        let exec_fn = move |_| {
            times += 1;
            async move {
                if times == 1 {
                    Ok(Status::executing(persist))
                } else {
                    Ok(Status::done())
                }
            }
            .boxed()
        };
        let normal = ProcedureAdapter {
            data: "normal".to_string(),
            lock_key: LockKey::single_exclusive("catalog.schema.table"),
            poison_keys: PoisonKeys::default(),
            exec_fn,
            rollback_fn: None,
        };

        let dir = create_temp_dir("normal");
        let meta = normal.new_meta(ROOT_ID);
        let ctx = context_without_provider(meta.id);
        let object_store = test_util::new_object_store(&dir);
        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
        let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
        runner.manager_ctx.start();

        // First step: the procedure is still executing.
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_running(), "{state:?}");
        check_files(
            &object_store,
            &procedure_store,
            ctx.procedure_id,
            first_files,
        )
        .await;

        // Second step: the procedure completes and is committed.
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_done(), "{state:?}");
        check_files(
            &object_store,
            &procedure_store,
            ctx.procedure_id,
            second_files,
        )
        .await;
    }
948
    /// With persistence enabled, the first step writes a step file and the second
    /// step adds a commit file.
    #[tokio::test]
    async fn test_execute_once_normal() {
        execute_once_normal(
            true,
            &["0000000000.step"],
            &["0000000000.step", "0000000001.commit"],
        )
        .await;
    }
958
    /// Without persistence, no step file is written and the commit file uses
    /// step number 0.
    #[tokio::test]
    async fn test_execute_once_normal_skip_persist() {
        execute_once_normal(false, &[], &["0000000000.commit"]).await;
    }
963
964 #[tokio::test]
965 async fn test_on_suspend_empty() {
966 let exec_fn = move |_| {
967 async move {
968 Ok(Status::Suspended {
969 subprocedures: Vec::new(),
970 persist: false,
971 })
972 }
973 .boxed()
974 };
975 let suspend = ProcedureAdapter {
976 data: "suspend".to_string(),
977 lock_key: LockKey::single_exclusive("catalog.schema.table"),
978 poison_keys: PoisonKeys::default(),
979 exec_fn,
980 rollback_fn: None,
981 };
982
983 let dir = create_temp_dir("suspend");
984 let meta = suspend.new_meta(ROOT_ID);
985 let ctx = context_without_provider(meta.id);
986 let object_store = test_util::new_object_store(&dir);
987 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
988 let mut runner = new_runner(meta, Box::new(suspend), procedure_store);
989 runner.manager_ctx.start();
990
991 runner.execute_once(&ctx).await;
992 let state = runner.meta.state();
993 assert!(state.is_running(), "{state:?}");
994 }
995
996 fn new_child_procedure(procedure_id: ProcedureId, keys: &[&str]) -> ProcedureWithId {
997 let mut times = 0;
998 let exec_fn = move |_| {
999 times += 1;
1000 async move {
1001 if times == 1 {
1002 time::sleep(Duration::from_millis(200)).await;
1003 Ok(Status::executing(true))
1004 } else {
1005 Ok(Status::done())
1006 }
1007 }
1008 .boxed()
1009 };
1010 let child = ProcedureAdapter {
1011 data: "child".to_string(),
1012 lock_key: LockKey::new_exclusive(keys.iter().map(|k| k.to_string())),
1013 poison_keys: PoisonKeys::default(),
1014 exec_fn,
1015 rollback_fn: None,
1016 };
1017
1018 ProcedureWithId {
1019 id: procedure_id,
1020 procedure: Box::new(child),
1021 }
1022 }
1023
    /// A parent that suspends on two children finishes after both children are
    /// done; afterwards all locks are released, the parent's stored files are
    /// deleted and outdated metadata can be removed.
    #[tokio::test]
    async fn test_on_suspend_by_subprocedures() {
        let mut times = 0;
        let children_ids = [ProcedureId::random(), ProcedureId::random()];
        // Lock keys of the two children; they differ from the parent's key.
        let keys = [
            &[
                "catalog.schema.table.region-0",
                "catalog.schema.table.region-1",
            ],
            &[
                "catalog.schema.table.region-2",
                "catalog.schema.table.region-3",
            ],
        ];

        let exec_fn = move |ctx: Context| {
            times += 1;
            async move {
                if times == 1 {
                    // First execution: submit the children.
                    Ok(Status::Suspended {
                        subprocedures: children_ids
                            .into_iter()
                            .zip(keys)
                            .map(|(id, key_slice)| new_child_procedure(id, key_slice))
                            .collect(),
                        persist: true,
                    })
                } else {
                    // Later executions: finish once every child is done.
                    let mut all_child_done = true;
                    for id in children_ids {
                        // A child without a state counts as not done.
                        let is_not_done = ctx
                            .provider
                            .procedure_state(id)
                            .await
                            .unwrap()
                            .map(|s| !s.is_done())
                            .unwrap_or(true);
                        if is_not_done {
                            all_child_done = false;
                        }
                    }
                    if all_child_done {
                        Ok(Status::done())
                    } else {
                        // Not all children are done yet: suspend again without
                        // submitting anything new.
                        Ok(Status::Suspended {
                            subprocedures: Vec::new(),
                            persist: false,
                        })
                    }
                }
            }
            .boxed()
        };
        let parent = ProcedureAdapter {
            data: "parent".to_string(),
            lock_key: LockKey::single_exclusive("catalog.schema.table"),
            poison_keys: PoisonKeys::default(),
            exec_fn,
            rollback_fn: None,
        };

        let dir = create_temp_dir("parent");
        let meta = parent.new_meta(ROOT_ID);
        let procedure_id = meta.id;

        let object_store = test_util::new_object_store(&dir);
        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store.clone());
        let poison_manager = Arc::new(InMemoryPoisonStore::default());
        let manager_ctx = Arc::new(ManagerContext::new(poison_manager));
        manager_ctx.start();
        // Register the parent in the manager context that the test inspects.
        assert!(manager_ctx.try_insert_procedure(meta));
        // Replace the context created by `new_runner` with the inspected one.
        runner.manager_ctx = manager_ctx.clone();

        runner.run().await;
        assert!(manager_ctx.key_lock.is_empty());

        // Parent and children must all be done.
        for child_id in children_ids {
            let state = manager_ctx.state(child_id).unwrap();
            assert!(state.is_done(), "{state:?}");
        }
        let state = manager_ctx.state(procedure_id).unwrap();
        assert!(state.is_done(), "{state:?}");
        // The files of the parent procedure were deleted from the store.
        check_files(&object_store, &procedure_store, procedure_id, &[]).await;

        // Wait longer than the TTL passed below so the metadata counts as outdated.
        tokio::time::sleep(Duration::from_millis(5)).await;
        manager_ctx.remove_outdated_meta(Duration::from_millis(1));
        assert!(manager_ctx.state(procedure_id).is_none());
        assert!(manager_ctx.finished_procedures.lock().unwrap().is_empty());
        for child_id in children_ids {
            assert!(manager_ctx.state(child_id).is_none());
        }
    }
1125
    /// After the manager is stopped, a successful execution step marks the
    /// procedure as failed and writes no further step file.
    #[tokio::test]
    async fn test_running_is_stopped() {
        let exec_fn = move |_| async move { Ok(Status::executing(true)) }.boxed();
        let normal = ProcedureAdapter {
            data: "normal".to_string(),
            lock_key: LockKey::single_exclusive("catalog.schema.table"),
            poison_keys: PoisonKeys::default(),
            exec_fn,
            rollback_fn: None,
        };

        let dir = create_temp_dir("test_running_is_stopped");
        let meta = normal.new_meta(ROOT_ID);
        let ctx = context_without_provider(meta.id);
        let object_store = test_util::new_object_store(&dir);
        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
        let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
        runner.manager_ctx.start();

        // While the manager runs, the step is persisted.
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_running(), "{state:?}");
        check_files(
            &object_store,
            &procedure_store,
            ctx.procedure_id,
            &["0000000000.step"],
        )
        .await;

        // After stopping, the next step fails and leaves the files untouched.
        runner.manager_ctx.stop();
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_failed(), "{state:?}");
        check_files(
            &object_store,
            &procedure_store,
            ctx.procedure_id,
            &["0000000000.step"],
        )
        .await;
    }
1169
1170 #[tokio::test]
1171 async fn test_running_is_stopped_on_error() {
1172 let exec_fn =
1173 |_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
1174 let normal = ProcedureAdapter {
1175 data: "fail".to_string(),
1176 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1177 poison_keys: PoisonKeys::default(),
1178 exec_fn,
1179 rollback_fn: None,
1180 };
1181
1182 let dir = create_temp_dir("test_running_is_stopped_on_error");
1183 let meta = normal.new_meta(ROOT_ID);
1184 let ctx = context_without_provider(meta.id);
1185 let object_store = test_util::new_object_store(&dir);
1186 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1187 let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
1188 runner.manager_ctx.stop();
1189
1190 runner.execute_once(&ctx).await;
1191 let state = runner.meta.state();
1192 assert!(state.is_failed(), "{state:?}");
1193 check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
1195 }
1196
1197 #[tokio::test]
1198 async fn test_execute_on_error() {
1199 let exec_fn =
1200 |_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
1201 let fail = ProcedureAdapter {
1202 data: "fail".to_string(),
1203 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1204 poison_keys: PoisonKeys::default(),
1205 exec_fn,
1206 rollback_fn: None,
1207 };
1208
1209 let dir = create_temp_dir("fail");
1210 let meta = fail.new_meta(ROOT_ID);
1211 let ctx = context_without_provider(meta.id);
1212 let object_store = test_util::new_object_store(&dir);
1213 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1214 let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
1215 runner.manager_ctx.start();
1216
1217 runner.execute_once(&ctx).await;
1218 let state = runner.meta.state();
1219 assert!(state.is_failed(), "{state:?}");
1220 check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
1221 }
1222
    /// A non-retryable error from a procedure with rollback support walks through
    /// prepare-rollback and rolling-back before failing, leaving a rollback file.
    #[tokio::test]
    async fn test_execute_with_rollback_on_error() {
        let exec_fn =
            |_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
        // The rollback itself always succeeds.
        let rollback_fn = move |_| async move { Ok(()) }.boxed();
        let fail = ProcedureAdapter {
            data: "fail".to_string(),
            lock_key: LockKey::single_exclusive("catalog.schema.table"),
            poison_keys: PoisonKeys::default(),
            exec_fn,
            rollback_fn: Some(Box::new(rollback_fn)),
        };

        let dir = create_temp_dir("fail");
        let meta = fail.new_meta(ROOT_ID);
        let ctx = context_without_provider(meta.id);
        let object_store = test_util::new_object_store(&dir);
        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
        let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
        runner.manager_ctx.start();

        // Step 1: the execution error requests a rollback.
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_prepare_rollback(), "{state:?}");

        // Step 2: the rollback record is written.
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_rolling_back(), "{state:?}");

        // Step 3: the rollback succeeds and the procedure ends up failed.
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_failed(), "{state:?}");
        check_files(
            &object_store,
            &procedure_store,
            ctx.procedure_id,
            &["0000000000.rollback"],
        )
        .await;
    }
1263
1264 #[tokio::test]
1265 async fn test_execute_on_retry_later_error() {
1266 let mut times = 0;
1267
1268 let exec_fn = move |_| {
1269 times += 1;
1270 async move {
1271 if times == 1 {
1272 Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
1273 } else if times == 2 {
1274 Ok(Status::executing(false))
1275 } else {
1276 Ok(Status::done())
1277 }
1278 }
1279 .boxed()
1280 };
1281
1282 let retry_later = ProcedureAdapter {
1283 data: "retry_later".to_string(),
1284 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1285 poison_keys: PoisonKeys::default(),
1286 exec_fn,
1287 rollback_fn: None,
1288 };
1289
1290 let dir = create_temp_dir("retry_later");
1291 let meta = retry_later.new_meta(ROOT_ID);
1292 let ctx = context_without_provider(meta.id);
1293 let object_store = test_util::new_object_store(&dir);
1294 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1295 let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
1296 runner.manager_ctx.start();
1297 runner.execute_once(&ctx).await;
1298 let state = runner.meta.state();
1299 assert!(state.is_retrying(), "{state:?}");
1300
1301 runner.execute_once(&ctx).await;
1302 let state = runner.meta.state();
1303 assert!(state.is_running(), "{state:?}");
1304
1305 runner.execute_once(&ctx).await;
1306 let state = runner.meta.state();
1307 assert!(state.is_done(), "{state:?}");
1308 assert!(meta.state().is_done());
1309 check_files(
1310 &object_store,
1311 &procedure_store,
1312 ctx.procedure_id,
1313 &["0000000000.commit"],
1314 )
1315 .await;
1316 }
1317
1318 #[tokio::test]
1319 async fn test_retrying_state_visible_in_context_on_retry() {
1320 let retrying_states = Arc::new(std::sync::Mutex::new(Vec::new()));
1321 let captured = retrying_states.clone();
1322 let mut times = 0;
1323
1324 let exec_fn = move |ctx: Context| {
1325 times += 1;
1326 let captured = captured.clone();
1327 async move {
1328 let is_retrying = ctx.is_retrying().await;
1329 captured.lock().unwrap().push(is_retrying);
1330 if times == 1 {
1331 Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
1332 } else {
1333 Ok(Status::done())
1334 }
1335 }
1336 .boxed()
1337 };
1338
1339 let procedure = ProcedureAdapter {
1340 data: "retrying_state".to_string(),
1341 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1342 poison_keys: PoisonKeys::default(),
1343 exec_fn,
1344 rollback_fn: None,
1345 };
1346
1347 let dir = create_temp_dir("retrying_state");
1348 let meta = procedure.new_meta(ROOT_ID);
1349 let object_store = test_util::new_object_store(&dir);
1350 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store));
1351 let mut runner = new_runner(meta.clone(), Box::new(procedure), procedure_store);
1352 let ctx = context_with_provider(
1353 meta.id,
1354 runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1355 );
1356
1357 runner
1358 .manager_ctx
1359 .procedures
1360 .write()
1361 .unwrap()
1362 .insert(meta.id, runner.meta.clone());
1363 runner.manager_ctx.start();
1364
1365 runner.execute_once(&ctx).await;
1366 runner.execute_once(&ctx).await;
1367
1368 let states = retrying_states.lock().unwrap().clone();
1369 assert_eq!(states, vec![Some(false), Some(true)]);
1370 }
1371
1372 #[tokio::test(flavor = "multi_thread")]
1373 async fn test_execute_on_retry_later_error_with_child() {
1374 common_telemetry::init_default_ut_logging();
1375 let mut times = 0;
1376 let child_id = ProcedureId::random();
1377
1378 let exec_fn = move |_| {
1379 times += 1;
1380 async move {
1381 debug!("times: {}", times);
1382 if times == 1 {
1383 Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
1384 } else if times == 2 {
1385 let exec_fn = |_| {
1386 async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }
1387 .boxed()
1388 };
1389 let fail = ProcedureAdapter {
1390 data: "fail".to_string(),
1391 lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
1392 poison_keys: PoisonKeys::default(),
1393 exec_fn,
1394 rollback_fn: None,
1395 };
1396
1397 Ok(Status::Suspended {
1398 subprocedures: vec![ProcedureWithId {
1399 id: child_id,
1400 procedure: Box::new(fail),
1401 }],
1402 persist: true,
1403 })
1404 } else {
1405 Ok(Status::done())
1406 }
1407 }
1408 .boxed()
1409 };
1410
1411 let retry_later = ProcedureAdapter {
1412 data: "retry_later".to_string(),
1413 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1414 poison_keys: PoisonKeys::default(),
1415 exec_fn,
1416 rollback_fn: None,
1417 };
1418
1419 let dir = create_temp_dir("retry_later");
1420 let meta = retry_later.new_meta(ROOT_ID);
1421 let ctx = context_without_provider(meta.id);
1422 let object_store = test_util::new_object_store(&dir);
1423 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1424 let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
1425 runner.manager_ctx.start();
1426 debug!("execute_once 1");
1427 runner.execute_once(&ctx).await;
1428 let state = runner.meta.state();
1429 assert!(state.is_retrying(), "{state:?}");
1430
1431 let moved_meta = meta.clone();
1432 tokio::spawn(async move {
1433 moved_meta.child_notify.notify_one();
1434 });
1435 runner.execute_once(&ctx).await;
1436 let state = runner.meta.state();
1437 assert!(state.is_running(), "{state:?}");
1438
1439 runner.execute_once(&ctx).await;
1440 let state = runner.meta.state();
1441 assert!(state.is_done(), "{state:?}");
1442 assert!(meta.state().is_done());
1443 check_files(
1444 &object_store,
1445 &procedure_store,
1446 ctx.procedure_id,
1447 &["0000000000.step", "0000000001.commit"],
1448 )
1449 .await;
1450 }
1451
1452 #[tokio::test]
1453 async fn test_execute_exceed_max_retry_later() {
1454 let exec_fn =
1455 |_| async { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed();
1456
1457 let exceed_max_retry_later = ProcedureAdapter {
1458 data: "exceed_max_retry_later".to_string(),
1459 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1460 poison_keys: PoisonKeys::default(),
1461 exec_fn,
1462 rollback_fn: None,
1463 };
1464
1465 let dir = create_temp_dir("exceed_max_retry_later");
1466 let meta = exceed_max_retry_later.new_meta(ROOT_ID);
1467 let object_store = test_util::new_object_store(&dir);
1468 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1469 let mut runner = new_runner(
1470 meta.clone(),
1471 Box::new(exceed_max_retry_later),
1472 procedure_store,
1473 );
1474 runner.manager_ctx.start();
1475
1476 runner.exponential_builder = ExponentialBuilder::default()
1477 .with_min_delay(Duration::from_millis(1))
1478 .with_max_times(3);
1479
1480 runner.execute_procedure_in_loop().await;
1482 let err = meta.state().error().unwrap().to_string();
1483 assert!(err.contains("Procedure retry exceeded max times"));
1484 }
1485
1486 #[tokio::test]
1487 async fn test_rollback_exceed_max_retry_later() {
1488 let exec_fn =
1489 |_| async { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed();
1490 let rollback_fn = move |_| {
1491 async move { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed()
1492 };
1493 let exceed_max_retry_later = ProcedureAdapter {
1494 data: "exceed_max_rollback".to_string(),
1495 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1496 poison_keys: PoisonKeys::default(),
1497 exec_fn,
1498 rollback_fn: Some(Box::new(rollback_fn)),
1499 };
1500
1501 let dir = create_temp_dir("exceed_max_rollback");
1502 let meta = exceed_max_retry_later.new_meta(ROOT_ID);
1503 let object_store = test_util::new_object_store(&dir);
1504 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1505 let mut runner = new_runner(
1506 meta.clone(),
1507 Box::new(exceed_max_retry_later),
1508 procedure_store,
1509 );
1510 runner.manager_ctx.start();
1511 runner.exponential_builder = ExponentialBuilder::default()
1512 .with_min_delay(Duration::from_millis(1))
1513 .with_max_times(3);
1514
1515 runner.execute_procedure_in_loop().await;
1517 let err = meta.state().error().unwrap().to_string();
1518 assert!(err.contains("Procedure rollback exceeded max times"));
1519 }
1520
1521 #[tokio::test]
1522 async fn test_rollback_after_retry_fail() {
1523 let exec_fn = move |_| {
1524 async move { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed()
1525 };
1526
1527 let (tx, mut rx) = mpsc::channel(1);
1528 let rollback_fn = move |_| {
1529 let tx = tx.clone();
1530 async move {
1531 tx.send(()).await.unwrap();
1532 Ok(())
1533 }
1534 .boxed()
1535 };
1536 let retry_later = ProcedureAdapter {
1537 data: "rollback_after_retry_fail".to_string(),
1538 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1539 poison_keys: PoisonKeys::default(),
1540 exec_fn,
1541 rollback_fn: Some(Box::new(rollback_fn)),
1542 };
1543
1544 let dir = create_temp_dir("retry_later");
1545 let meta = retry_later.new_meta(ROOT_ID);
1546 let ctx = context_without_provider(meta.id);
1547 let object_store = test_util::new_object_store(&dir);
1548 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1549 let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
1550 runner.manager_ctx.start();
1551 runner.exponential_builder = ExponentialBuilder::default()
1552 .with_min_delay(Duration::from_millis(1))
1553 .with_max_times(3);
1554 runner.execute_procedure_in_loop().await;
1556 rx.recv().await.unwrap();
1557 assert_eq!(rx.try_recv().unwrap_err(), mpsc::error::TryRecvError::Empty);
1558 check_files(
1559 &object_store,
1560 &procedure_store,
1561 ctx.procedure_id,
1562 &["0000000000.rollback"],
1563 )
1564 .await;
1565 }
1566
1567 #[tokio::test]
1568 async fn test_child_error() {
1569 let mut times = 0;
1570 let child_id = ProcedureId::random();
1571 common_telemetry::init_default_ut_logging();
1572 let exec_fn = move |ctx: Context| {
1573 times += 1;
1574 async move {
1575 if times == 1 {
1576 let exec_fn = |_| {
1578 async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }
1579 .boxed()
1580 };
1581 let fail = ProcedureAdapter {
1582 data: "fail".to_string(),
1583 lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
1584 poison_keys: PoisonKeys::default(),
1585 exec_fn,
1586 rollback_fn: None,
1587 };
1588
1589 Ok(Status::Suspended {
1590 subprocedures: vec![ProcedureWithId {
1591 id: child_id,
1592 procedure: Box::new(fail),
1593 }],
1594 persist: true,
1595 })
1596 } else {
1597 let state = ctx.provider.procedure_state(child_id).await.unwrap();
1599 let is_failed = state.map(|s| s.is_failed()).unwrap_or(false);
1600 if is_failed {
1601 Err(Error::from_error_ext(PlainError::new(
1603 "subprocedure failed".to_string(),
1604 StatusCode::Unexpected,
1605 )))
1606 } else {
1607 Ok(Status::Suspended {
1609 subprocedures: Vec::new(),
1610 persist: false,
1611 })
1612 }
1613 }
1614 }
1615 .boxed()
1616 };
1617 let parent = ProcedureAdapter {
1618 data: "parent".to_string(),
1619 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1620 poison_keys: PoisonKeys::default(),
1621 exec_fn,
1622 rollback_fn: None,
1623 };
1624
1625 let dir = create_temp_dir("child_err");
1626 let meta = parent.new_meta(ROOT_ID);
1627
1628 let object_store = test_util::new_object_store(&dir);
1629 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1630 let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
1631 let poison_manager = Arc::new(InMemoryPoisonStore::default());
1632 let manager_ctx = Arc::new(ManagerContext::new(poison_manager));
1633 manager_ctx.start();
1634 assert!(manager_ctx.try_insert_procedure(meta.clone()));
1636 runner.manager_ctx = manager_ctx.clone();
1638
1639 runner.run().await;
1641 assert!(manager_ctx.key_lock.is_empty());
1642 let err = meta.state().error().unwrap().output_msg();
1643 assert!(err.contains("subprocedure failed"), "{err}");
1644 }
1645
1646 #[tokio::test]
1647 async fn test_execute_with_clean_poisons() {
1648 common_telemetry::init_default_ut_logging();
1649 let mut times = 0;
1650 let poison_key = PoisonKey::new("table/1024");
1651 let moved_poison_key = poison_key.clone();
1652 let exec_fn = move |ctx: Context| {
1653 times += 1;
1654 let poison_key = moved_poison_key.clone();
1655 async move {
1656 if times == 1 {
1657 ctx.provider
1659 .try_put_poison(&poison_key, ctx.procedure_id)
1660 .await
1661 .unwrap();
1662
1663 Ok(Status::executing(true))
1664 } else {
1665 Ok(Status::executing_with_clean_poisons(true))
1666 }
1667 }
1668 .boxed()
1669 };
1670 let poison = ProcedureAdapter {
1671 data: "poison".to_string(),
1672 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1673 poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1674 exec_fn,
1675 rollback_fn: None,
1676 };
1677
1678 let dir = create_temp_dir("clean_poisons");
1679 let meta = poison.new_meta(ROOT_ID);
1680
1681 let object_store = test_util::new_object_store(&dir);
1682 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1683 let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store.clone());
1684
1685 let ctx = context_with_provider(
1687 meta.id,
1688 runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1689 );
1690 runner
1692 .manager_ctx
1693 .procedures
1694 .write()
1695 .unwrap()
1696 .insert(meta.id, runner.meta.clone());
1697
1698 runner.manager_ctx.start();
1699 runner.execute_once(&ctx).await;
1700 let state = runner.meta.state();
1701 assert!(state.is_running(), "{state:?}");
1702
1703 let procedure_id = runner
1704 .manager_ctx
1705 .poison_manager
1706 .get_poison(&poison_key.to_string())
1707 .await
1708 .unwrap();
1709 assert!(procedure_id.is_some());
1711
1712 runner.execute_once(&ctx).await;
1713 let state = runner.meta.state();
1714 assert!(state.is_running(), "{state:?}");
1715
1716 let procedure_id = runner
1717 .manager_ctx
1718 .poison_manager
1719 .get_poison(&poison_key.to_string())
1720 .await
1721 .unwrap();
1722 assert!(procedure_id.is_none());
1724 }
1725
1726 #[tokio::test]
1727 async fn test_execute_error_with_clean_poisons() {
1728 common_telemetry::init_default_ut_logging();
1729 let mut times = 0;
1730 let poison_key = PoisonKey::new("table/1024");
1731 let moved_poison_key = poison_key.clone();
1732 let exec_fn = move |ctx: Context| {
1733 times += 1;
1734 let poison_key = moved_poison_key.clone();
1735 async move {
1736 if times == 1 {
1737 ctx.provider
1739 .try_put_poison(&poison_key, ctx.procedure_id)
1740 .await
1741 .unwrap();
1742
1743 Ok(Status::executing(true))
1744 } else {
1745 Err(Error::external_and_clean_poisons(MockError::new(
1746 StatusCode::Unexpected,
1747 )))
1748 }
1749 }
1750 .boxed()
1751 };
1752 let poison = ProcedureAdapter {
1753 data: "poison".to_string(),
1754 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1755 poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1756 exec_fn,
1757 rollback_fn: None,
1758 };
1759
1760 let dir = create_temp_dir("error_with_clean_poisons");
1761 let meta = poison.new_meta(ROOT_ID);
1762
1763 let object_store = test_util::new_object_store(&dir);
1764 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1765 let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store.clone());
1766
1767 let ctx = context_with_provider(
1769 meta.id,
1770 runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1771 );
1772 runner
1774 .manager_ctx
1775 .procedures
1776 .write()
1777 .unwrap()
1778 .insert(meta.id, runner.meta.clone());
1779
1780 runner.manager_ctx.start();
1781 runner.execute_once(&ctx).await;
1782 let state = runner.meta.state();
1783 assert!(state.is_running(), "{state:?}");
1784
1785 let procedure_id = runner
1786 .manager_ctx
1787 .poison_manager
1788 .get_poison(&poison_key.to_string())
1789 .await
1790 .unwrap();
1791 assert!(procedure_id.is_some());
1793
1794 runner.execute_once(&ctx).await;
1795 let state = runner.meta.state();
1796 assert!(state.is_failed(), "{state:?}");
1797
1798 let procedure_id = runner
1799 .manager_ctx
1800 .poison_manager
1801 .get_poison(&poison_key.to_string())
1802 .await
1803 .unwrap();
1804 assert!(procedure_id.is_none());
1806 }
1807
1808 #[tokio::test]
1809 async fn test_execute_failed_after_set_poison() {
1810 let mut times = 0;
1811 let poison_key = PoisonKey::new("table/1024");
1812 let moved_poison_key = poison_key.clone();
1813 let exec_fn = move |ctx: Context| {
1814 times += 1;
1815 let poison_key = moved_poison_key.clone();
1816 async move {
1817 if times == 1 {
1818 Ok(Status::executing(true))
1819 } else {
1820 ctx.provider
1822 .try_put_poison(&poison_key, ctx.procedure_id)
1823 .await
1824 .unwrap();
1825 Err(Error::external(MockError::new(StatusCode::Unexpected)))
1826 }
1827 }
1828 .boxed()
1829 };
1830 let poison = ProcedureAdapter {
1831 data: "poison".to_string(),
1832 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1833 poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1834 exec_fn,
1835 rollback_fn: None,
1836 };
1837
1838 let dir = create_temp_dir("poison");
1839 let meta = poison.new_meta(ROOT_ID);
1840
1841 let object_store = test_util::new_object_store(&dir);
1842 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1843 let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store.clone());
1844
1845 let ctx = context_with_provider(
1847 meta.id,
1848 runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1849 );
1850 runner
1852 .manager_ctx
1853 .procedures
1854 .write()
1855 .unwrap()
1856 .insert(meta.id, runner.meta.clone());
1857
1858 runner.manager_ctx.start();
1859 runner.execute_once(&ctx).await;
1860 let state = runner.meta.state();
1861 assert!(state.is_running(), "{state:?}");
1862
1863 runner.execute_once(&ctx).await;
1864 let state = runner.meta.state();
1865 assert!(state.is_failed(), "{state:?}");
1866 assert!(meta.state().is_failed());
1867
1868 let procedure_id = runner
1870 .manager_ctx
1871 .poison_manager
1872 .get_poison(&poison_key.to_string())
1873 .await
1874 .unwrap()
1875 .unwrap();
1876
1877 assert_eq!(&procedure_id.clone(), ROOT_ID);
1879 }
1880
1881 #[tokio::test]
1882 async fn test_execute_exceed_max_retry_after_set_poison() {
1883 common_telemetry::init_default_ut_logging();
1884 let mut times = 0;
1885 let poison_key = PoisonKey::new("table/1024");
1886 let moved_poison_key = poison_key.clone();
1887 let exec_fn = move |ctx: Context| {
1888 times += 1;
1889 let poison_key = moved_poison_key.clone();
1890 async move {
1891 if times == 1 {
1892 Ok(Status::executing(true))
1893 } else {
1894 ctx.provider
1896 .try_put_poison(&poison_key, ctx.procedure_id)
1897 .await
1898 .unwrap();
1899 Err(Error::retry_later_and_clean_poisons(MockError::new(
1900 StatusCode::Unexpected,
1901 )))
1902 }
1903 }
1904 .boxed()
1905 };
1906 let poison = ProcedureAdapter {
1907 data: "poison".to_string(),
1908 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1909 poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1910 exec_fn,
1911 rollback_fn: None,
1912 };
1913
1914 let dir = create_temp_dir("exceed_max_after_set_poison");
1915 let meta = poison.new_meta(ROOT_ID);
1916 let object_store = test_util::new_object_store(&dir);
1917 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1918 let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store);
1919 runner.manager_ctx.start();
1920 runner.exponential_builder = ExponentialBuilder::default()
1921 .with_min_delay(Duration::from_millis(1))
1922 .with_max_times(3);
1923 let ctx = context_with_provider(
1925 meta.id,
1926 runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1927 );
1928 runner
1930 .manager_ctx
1931 .procedures
1932 .write()
1933 .unwrap()
1934 .insert(meta.id, runner.meta.clone());
1935 runner.execute_once_with_retry(&ctx).await;
1937 let err = meta.state().error().unwrap().clone();
1938 assert_matches!(&*err, Error::RetryTimesExceeded { .. });
1939
1940 let procedure_id = runner
1942 .manager_ctx
1943 .poison_manager
1944 .get_poison(&poison_key.to_string())
1945 .await
1946 .unwrap();
1947 assert_eq!(procedure_id, None);
1948 }
1949
1950 #[tokio::test]
1951 async fn test_execute_poisoned() {
1952 let mut times = 0;
1953 let poison_key = PoisonKey::new("table/1024");
1954 let moved_poison_key = poison_key.clone();
1955 let exec_fn = move |ctx: Context| {
1956 times += 1;
1957 let poison_key = moved_poison_key.clone();
1958 async move {
1959 if times == 1 {
1960 Ok(Status::executing(true))
1961 } else {
1962 ctx.provider
1964 .try_put_poison(&poison_key, ctx.procedure_id)
1965 .await
1966 .unwrap();
1967 Ok(Status::Poisoned {
1968 keys: PoisonKeys::new(vec![poison_key.clone()]),
1969 error: Error::external(MockError::new(StatusCode::Unexpected)),
1970 })
1971 }
1972 }
1973 .boxed()
1974 };
1975 let poison = ProcedureAdapter {
1976 data: "poison".to_string(),
1977 lock_key: LockKey::single_exclusive("catalog.schema.table"),
1978 poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1979 exec_fn,
1980 rollback_fn: None,
1981 };
1982
1983 let dir = create_temp_dir("poison");
1984 let meta = poison.new_meta(ROOT_ID);
1985
1986 let object_store = test_util::new_object_store(&dir);
1987 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1988 let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store.clone());
1989
1990 let ctx = context_with_provider(
1992 meta.id,
1993 runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1994 );
1995 runner
1997 .manager_ctx
1998 .procedures
1999 .write()
2000 .unwrap()
2001 .insert(meta.id, runner.meta.clone());
2002
2003 runner.manager_ctx.start();
2004 runner.execute_once(&ctx).await;
2005 let state = runner.meta.state();
2006 assert!(state.is_running(), "{state:?}");
2007
2008 runner.execute_once(&ctx).await;
2009 let state = runner.meta.state();
2010 assert!(state.is_poisoned(), "{state:?}");
2011 assert!(meta.state().is_poisoned());
2012 check_files(
2013 &object_store,
2014 &procedure_store,
2015 ctx.procedure_id,
2016 &["0000000000.step"],
2017 )
2018 .await;
2019
2020 let procedure_id = runner
2022 .manager_ctx
2023 .poison_manager
2024 .get_poison(&poison_key.to_string())
2025 .await
2026 .unwrap()
2027 .unwrap();
2028
2029 assert_eq!(procedure_id, ROOT_ID);
2031 }
2032
2033 fn test_procedure_with_dynamic_lock(
2034 shared_atomic_value: Arc<AtomicU64>,
2035 id: u64,
2036 ) -> (BoxedProcedure, Arc<ProcedureMeta>) {
2037 let exec_fn = move |ctx: Context| {
2038 let moved_shared_atomic_value = shared_atomic_value.clone();
2039 let moved_ctx = ctx.clone();
2040 async move {
2041 debug!("Acquiring write lock, id: {}", id);
2042 let key = StringKey::Exclusive("test_lock".to_string());
2043 let guard = moved_ctx.provider.acquire_lock(&key).await;
2044 debug!("Acquired write lock, id: {}", id);
2045 let millis = rand::rng().random_range(10..=50);
2046 tokio::time::sleep(Duration::from_millis(millis)).await;
2047 let value = moved_shared_atomic_value.load(Ordering::Relaxed);
2048 moved_shared_atomic_value.store(value + 1, Ordering::Relaxed);
2049 debug!("Dropping write lock, id: {}", id);
2050 drop(guard);
2051
2052 Ok(Status::done())
2053 }
2054 .boxed()
2055 };
2056
2057 let adapter = ProcedureAdapter {
2058 data: "dynamic_lock".to_string(),
2059 lock_key: LockKey::new_exclusive([]),
2060 poison_keys: PoisonKeys::new([]),
2061 exec_fn,
2062 rollback_fn: None,
2063 };
2064 let meta = adapter.new_meta(ROOT_ID);
2065
2066 (Box::new(adapter), meta)
2067 }
2068
2069 #[tokio::test(flavor = "multi_thread")]
2070 async fn test_execute_with_dynamic_lock() {
2071 common_telemetry::init_default_ut_logging();
2072 let shared_atomic_value = Arc::new(AtomicU64::new(0));
2073 let (procedure1, meta1) = test_procedure_with_dynamic_lock(shared_atomic_value.clone(), 1);
2074 let (procedure2, meta2) = test_procedure_with_dynamic_lock(shared_atomic_value.clone(), 2);
2075
2076 let dir = create_temp_dir("dynamic_lock");
2077 let object_store = test_util::new_object_store(&dir);
2078 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
2079 let mut runner1 = new_runner(meta1.clone(), procedure1, procedure_store.clone());
2080 let mut runner2 = new_runner(meta2.clone(), procedure2, procedure_store.clone());
2081 let ctx1 = context_with_provider(
2082 meta1.id,
2083 runner1.manager_ctx.clone() as Arc<dyn ContextProvider>,
2084 );
2085 let ctx2 = context_with_provider(
2086 meta2.id,
2087 runner1.manager_ctx.clone() as Arc<dyn ContextProvider>,
2089 );
2090 let tasks = [runner1.execute_once(&ctx1), runner2.execute_once(&ctx2)];
2091 join_all(tasks).await;
2092 assert_eq!(shared_atomic_value.load(Ordering::Relaxed), 2);
2093 }
2094 #[tokio::test]
2095 async fn test_on_suspend_deadlock_detected_no_rollback() {
2096 let child_id = ProcedureId::random();
2099 let exec_fn = move |_| {
2100 async move {
2101 let child_exec_fn = |_| async { Ok(Status::done()) }.boxed();
2102 let child = ProcedureAdapter {
2103 data: "child".to_string(),
2104 lock_key: LockKey::single_exclusive("catalog.schema.table"),
2105 poison_keys: PoisonKeys::default(),
2106 exec_fn: child_exec_fn,
2107 rollback_fn: None,
2108 };
2109 Ok(Status::Suspended {
2110 subprocedures: vec![ProcedureWithId {
2111 id: child_id,
2112 procedure: Box::new(child),
2113 }],
2114 persist: false,
2115 })
2116 }
2117 .boxed()
2118 };
2119 let parent = ProcedureAdapter {
2120 data: "parent".to_string(),
2121 lock_key: LockKey::single_exclusive("catalog.schema.table"),
2122 poison_keys: PoisonKeys::default(),
2123 exec_fn,
2124 rollback_fn: None, };
2126
2127 let dir = create_temp_dir("deadlock_no_rollback");
2128 let meta = parent.new_meta(ROOT_ID);
2129 let ctx = context_without_provider(meta.id);
2130 let object_store = test_util::new_object_store(&dir);
2131 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
2132 let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
2133 runner.manager_ctx.start();
2134
2135 runner.execute_once(&ctx).await;
2136 let state = runner.meta.state();
2137 assert!(state.is_failed(), "Expected Failed, got {state:?}");
2138 assert!(
2140 state.error().is_some(),
2141 "Failed state should contain an error"
2142 );
2143 assert!(
2145 !runner.manager_ctx.contains_procedure(child_id),
2146 "Child procedure should not be submitted when deadlock is detected"
2147 );
2148 }
2149
2150 #[tokio::test]
2151 async fn test_on_suspend_deadlock_detected_with_rollback() {
2152 let child_id = ProcedureId::random();
2155 let exec_fn = move |_| {
2156 async move {
2157 let child_exec_fn = |_| async { Ok(Status::done()) }.boxed();
2158 let child = ProcedureAdapter {
2159 data: "child".to_string(),
2160 lock_key: LockKey::single_exclusive("catalog.schema.table"),
2161 poison_keys: PoisonKeys::default(),
2162 exec_fn: child_exec_fn,
2163 rollback_fn: None,
2164 };
2165 Ok(Status::Suspended {
2166 subprocedures: vec![ProcedureWithId {
2167 id: child_id,
2168 procedure: Box::new(child),
2169 }],
2170 persist: false,
2171 })
2172 }
2173 .boxed()
2174 };
2175 let rollback_fn = move |_| async move { Ok(()) }.boxed();
2176 let parent = ProcedureAdapter {
2177 data: "parent".to_string(),
2178 lock_key: LockKey::single_exclusive("catalog.schema.table"),
2179 poison_keys: PoisonKeys::default(),
2180 exec_fn,
2181 rollback_fn: Some(Box::new(rollback_fn)), };
2183
2184 let dir = create_temp_dir("deadlock_with_rollback");
2185 let meta = parent.new_meta(ROOT_ID);
2186 let ctx = context_without_provider(meta.id);
2187 let object_store = test_util::new_object_store(&dir);
2188 let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
2189 let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
2190 runner.manager_ctx.start();
2191
2192 runner.execute_once(&ctx).await;
2193 let state = runner.meta.state();
2194 assert!(
2195 state.is_prepare_rollback(),
2196 "Expected PrepareRollback, got {state:?}"
2197 );
2198 match &state {
2200 ProcedureState::PrepareRollback { error } => {
2201 assert!(!error.to_string().is_empty(), "Error should not be empty");
2202 }
2203 _ => panic!("Expected PrepareRollback, got {state:?}"),
2204 }
2205 assert!(
2207 !runner.manager_ctx.contains_procedure(child_id),
2208 "Child procedure should not be submitted when deadlock is detected"
2209 );
2210 }
2211
/// Checks `find_lock_conflicts` for every Share/Exclusive pairing on the same
/// key, plus a mixed case with several keys.
#[test]
fn test_find_lock_conflicts() {
    use crate::procedure::StringKey;

    // Small constructors to keep the cases readable.
    let share = |name: &str| StringKey::Share(name.to_string());
    let exclusive = |name: &str| StringKey::Exclusive(name.to_string());

    // Share held, Share requested: no conflict.
    let held = [share("A")];
    let requested = [share("A")];
    assert!(super::find_lock_conflicts(held.iter(), requested.iter()).is_empty());

    // Share held, Exclusive requested: conflict on "A".
    let held = [share("A")];
    let requested = [exclusive("A")];
    assert_eq!(
        super::find_lock_conflicts(held.iter(), requested.iter()),
        vec!["A".to_string()]
    );

    // Exclusive held, Share requested: conflict on "A".
    let held = [exclusive("A")];
    let requested = [share("A")];
    assert_eq!(
        super::find_lock_conflicts(held.iter(), requested.iter()),
        vec!["A".to_string()]
    );

    // Exclusive held, Exclusive requested: conflict on "A".
    let held = [exclusive("A")];
    let requested = [exclusive("A")];
    assert_eq!(
        super::find_lock_conflicts(held.iter(), requested.iter()),
        vec!["A".to_string()]
    );

    // Mixed keys: "A" and "B" are expected to conflict; "C" is requested
    // only by the child and is not expected in the result.
    let held = [share("A"), exclusive("B")];
    let requested = [exclusive("A"), share("B"), exclusive("C")];
    let mut conflicts = super::find_lock_conflicts(held.iter(), requested.iter());
    // Sort so the comparison does not depend on the order of the result.
    conflicts.sort();
    assert_eq!(conflicts, vec!["A".to_string(), "B".to_string()]);
}
2259}