Skip to main content

common_procedure/local/
runner.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::ops::Add;
16use std::sync::Arc;
17use std::time::Duration;
18
19use backon::{BackoffBuilder, ExponentialBuilder};
20use common_error::ext::PlainError;
21use common_error::status_code::StatusCode;
22use common_event_recorder::EventRecorderRef;
23use common_telemetry::tracing::warn;
24use common_telemetry::tracing_context::{FutureExt, TracingContext};
25use common_telemetry::{debug, error, info, tracing};
26use rand::Rng;
27use snafu::ResultExt;
28use tokio::time;
29
30use crate::error::{self, ProcedurePanicSnafu, Result, RollbackTimesExceededSnafu};
31use crate::local::{ManagerContext, ProcedureMeta, ProcedureMetaRef};
32use crate::procedure::{Output, StringKey};
33use crate::rwlock::OwnedKeyRwLockGuard;
34use crate::store::{ProcedureMessage, ProcedureStore};
35use crate::{
36    BoxedProcedure, Context, Error, Procedure, ProcedureId, ProcedureState, ProcedureWithId, Status,
37};
38
39/// A guard to cleanup procedure state.
40struct ProcedureGuard {
41    meta: ProcedureMetaRef,
42    manager_ctx: Arc<ManagerContext>,
43    key_guards: Vec<OwnedKeyRwLockGuard>,
44    finish: bool,
45}
46
47impl ProcedureGuard {
48    /// Returns a new [ProcedureGuard].
49    fn new(meta: ProcedureMetaRef, manager_ctx: Arc<ManagerContext>) -> ProcedureGuard {
50        ProcedureGuard {
51            meta,
52            manager_ctx,
53            key_guards: vec![],
54            finish: false,
55        }
56    }
57
58    /// The procedure is finished successfully.
59    fn finish(mut self) {
60        self.finish = true;
61    }
62}
63
64impl Drop for ProcedureGuard {
65    fn drop(&mut self) {
66        if !self.finish {
67            error!("Procedure {} exits unexpectedly", self.meta.id);
68
69            // Set state to failed. This is useful in test as runtime may not abort when the runner task panics.
70            // See https://github.com/tokio-rs/tokio/issues/2002 .
71            // We set set_panic_hook() in the application's main function. But our tests don't have this panic hook.
72            let err = ProcedurePanicSnafu {
73                procedure_id: self.meta.id,
74            }
75            .build();
76            self.meta.set_state(ProcedureState::failed(Arc::new(err)));
77        }
78
79        // Notify parent procedure.
80        if let Some(parent_id) = self.meta.parent_id {
81            self.manager_ctx.notify_by_subprocedure(parent_id);
82        }
83
84        // Drops the key guards in the reverse order.
85        while !self.key_guards.is_empty() {
86            self.key_guards.pop();
87        }
88
89        // Clean the staled locks.
90        self.manager_ctx
91            .key_lock
92            .clean_keys(self.meta.lock_key.keys_to_lock().map(|k| k.as_string()));
93    }
94}
95
96/// Returns a list of conflicting lock keys between a parent and a child procedure.
97/// Evaluates the Read/Write lock compatibility matrix:
98/// - Share + Share => Compatible
99/// - Exclusive + Any => Conflict
100/// - Any + Exclusive => Conflict
101fn find_lock_conflicts<'a>(
102    parent_keys: impl Iterator<Item = &'a StringKey>,
103    child_keys: impl Iterator<Item = &'a StringKey>,
104) -> Vec<String> {
105    use std::collections::HashMap;
106
107    // Map from key string slice (&str) to a boolean indicating if the parent holds it EXCLUSIVELY.
108    let mut parent_map = HashMap::new();
109    for key in parent_keys {
110        match key {
111            StringKey::Exclusive(k) => {
112                parent_map.insert(k.as_str(), true);
113            }
114            StringKey::Share(k) => {
115                parent_map.entry(k.as_str()).or_insert(false);
116            }
117        }
118    }
119
120    child_keys
121        .filter_map(|child_key| match child_key {
122            StringKey::Exclusive(k) | StringKey::Share(k)
123                if parent_map.get(k.as_str()) == Some(&true) =>
124            {
125                Some(k.clone())
126            }
127            StringKey::Exclusive(k) if parent_map.get(k.as_str()) == Some(&false) => {
128                Some(k.clone())
129            }
130            _ => None,
131        })
132        .collect()
133}
134
/// Drives a single procedure: acquires its key locks, executes it with retry and rollback
/// handling, and writes its progress to the procedure store.
pub(crate) struct Runner {
    /// Metadata (id, state, lock keys, parent id, ...) of the procedure being run.
    pub(crate) meta: ProcedureMetaRef,
    /// The procedure to execute.
    pub(crate) procedure: BoxedProcedure,
    /// Shared manager context: key locks, poison manager and procedure bookkeeping.
    pub(crate) manager_ctx: Arc<ManagerContext>,
    /// Step number handed to the store on the next write; incremented after each successful
    /// write.
    pub(crate) step: u32,
    /// Builder of the backoff delays used between retries and between rollback attempts.
    pub(crate) exponential_builder: ExponentialBuilder,
    /// Store used to persist, commit, roll back and delete procedure records.
    pub(crate) store: Arc<ProcedureStore>,
    /// Reset to `false` right before the procedure is executed.
    // NOTE(review): no other read of this flag is visible in this part of the file — confirm
    // whether it is still needed.
    pub(crate) rolling_back: bool,
    /// Optional event recorder, forwarded to the metadata and runners of subprocedures.
    pub(crate) event_recorder: Option<EventRecorderRef>,
}
145
146impl Runner {
147    /// Return `ProcedureManager` is running.
148    pub(crate) fn running(&self) -> bool {
149        self.manager_ctx.running()
150    }
151
152    /// Run the procedure.
153    pub(crate) async fn run(mut self) {
154        // Ensure we can update the procedure state.
155        let mut guard = ProcedureGuard::new(self.meta.clone(), self.manager_ctx.clone());
156
157        info!(
158            "Runner {}-{} starts",
159            self.procedure.type_name(),
160            self.meta.id
161        );
162
163        // TODO(yingwen): Detect recursive locking (and deadlock) if possible. Maybe we could detect
164        // recursive locking by adding a root procedure id to the meta.
165        for key in self.meta.lock_key.keys_to_lock() {
166            // Acquire lock for each key.
167            let key_guard = match key {
168                StringKey::Share(key) => self.manager_ctx.key_lock.read(key.clone()).await.into(),
169                StringKey::Exclusive(key) => {
170                    self.manager_ctx.key_lock.write(key.clone()).await.into()
171                }
172            };
173
174            guard.key_guards.push(key_guard);
175        }
176
177        // Execute the procedure. We need to release the lock whenever the execution
178        // is successful or fail.
179        self.meta.set_start_time_ms();
180        self.execute_procedure_in_loop().await;
181        self.meta.set_end_time_ms();
182
183        // We can't remove the metadata of the procedure now as users and its parent might
184        // need to query its state.
185        // TODO(yingwen): 1. Add TTL to the metadata; 2. Only keep state in the procedure store
186        // so we don't need to always store the metadata in memory after the procedure is done.
187
188        // Release locks and notify parent procedure.
189        guard.finish();
190
191        // If this is the root procedure, clean up message cache.
192        if self.meta.parent_id.is_none() {
193            let procedure_ids = self.manager_ctx.procedures_in_tree(&self.meta);
194            // Clean resources.
195            self.manager_ctx.on_procedures_finish(&procedure_ids);
196
197            // If `ProcedureManager` is stopped, it stops the current task immediately without deleting the procedure.
198            if !self.running() {
199                return;
200            }
201
202            for id in procedure_ids {
203                if let Err(e) = self.store.delete_procedure(id).await {
204                    error!(
205                        e;
206                        "Runner {}-{} failed to delete procedure {}",
207                        self.procedure.type_name(),
208                        self.meta.id,
209                        id,
210                    );
211                }
212            }
213        }
214
215        info!(
216            "Runner {}-{} exits",
217            self.procedure.type_name(),
218            self.meta.id
219        );
220    }
221
222    async fn execute_procedure_in_loop(&mut self) {
223        let ctx = Context {
224            procedure_id: self.meta.id,
225            provider: self.manager_ctx.clone(),
226        };
227
228        self.rolling_back = false;
229        self.execute_once_with_retry(&ctx).await;
230    }
231
232    async fn execute_once_with_retry(&mut self, ctx: &Context) {
233        let mut retry = self.exponential_builder.build();
234        let mut retry_times = 0;
235
236        let mut rollback = self.exponential_builder.build();
237        let mut rollback_times = 0;
238
239        loop {
240            // Don't store state if `ProcedureManager` is stopped.
241            if !self.running() {
242                self.meta.set_state(ProcedureState::failed(Arc::new(
243                    error::ManagerNotStartSnafu {}.build(),
244                )));
245                return;
246            }
247            let state = self.meta.state();
248            match state {
249                ProcedureState::Running => {}
250                ProcedureState::Retrying { error } => {
251                    retry_times += 1;
252                    if let Some(d) = retry.next() {
253                        let millis = d.as_millis() as u64;
254                        // Add random noise to the retry delay to avoid retry storms.
255                        let noise = rand::rng().random_range(0..(millis / 4) + 1);
256                        let d = d.add(Duration::from_millis(noise));
257
258                        self.wait_on_err(d, retry_times).await;
259                    } else {
260                        self.meta
261                            .set_state(ProcedureState::prepare_rollback(Arc::new(
262                                Error::RetryTimesExceeded {
263                                    source: error.clone(),
264                                    procedure_id: self.meta.id,
265                                },
266                            )));
267                    }
268                }
269                ProcedureState::PrepareRollback { error }
270                | ProcedureState::RollingBack { error } => {
271                    rollback_times += 1;
272                    if let Some(d) = rollback.next() {
273                        self.wait_on_err(d, rollback_times).await;
274                    } else {
275                        let err = Err::<(), Arc<Error>>(error)
276                            .context(RollbackTimesExceededSnafu {
277                                procedure_id: self.meta.id,
278                            })
279                            .unwrap_err();
280                        self.meta.set_state(ProcedureState::failed(Arc::new(err)));
281                        return;
282                    }
283                }
284                ProcedureState::Done { .. } => return,
285                ProcedureState::Failed { .. } => return,
286                ProcedureState::Poisoned { .. } => return,
287            }
288            self.execute_once(ctx).await;
289        }
290    }
291
292    async fn clean_poisons(&mut self) -> Result<()> {
293        let mut error = None;
294        for key in self.meta.poison_keys.iter() {
295            let key = key.to_string();
296            if let Err(e) = self
297                .manager_ctx
298                .poison_manager
299                .delete_poison(key, self.meta.id.to_string())
300                .await
301            {
302                error!(e; "Failed to clean poisons for procedure: {}", self.meta.id);
303                error = Some(e);
304            }
305        }
306
307        // returns the last error if any.
308        if let Some(e) = error {
309            return Err(e);
310        }
311        Ok(())
312    }
313
314    async fn rollback(&mut self, ctx: &Context, err: Arc<Error>) {
315        if self.procedure.rollback_supported()
316            && let Err(e) = self.procedure.rollback(ctx).await
317        {
318            self.meta
319                .set_state(ProcedureState::rolling_back(Arc::new(e)));
320            return;
321        }
322        self.meta.set_state(ProcedureState::failed(err));
323    }
324
325    async fn prepare_rollback(&mut self, err: Arc<Error>) {
326        if let Err(e) = self.write_rollback_procedure_state(err.to_string()).await {
327            self.meta
328                .set_state(ProcedureState::prepare_rollback(Arc::new(e)));
329            return;
330        }
331        if self.procedure.rollback_supported() {
332            self.meta.set_state(ProcedureState::rolling_back(err));
333        } else {
334            self.meta.set_state(ProcedureState::failed(err));
335        }
336    }
337
    /// Advances the procedure by one step, depending on its current state.
    ///
    /// - `Running` / `Retrying`: calls `execute` on the procedure once and translates the
    ///   returned status or error into the next state (see the comments below).
    /// - `PrepareRollback`: delegates to `prepare_rollback`.
    /// - `RollingBack`: delegates to `rollback`.
    /// - `Failed` / `Done` / `Poisoned`: does nothing.
    ///
    /// The method never returns an error; every outcome is recorded by setting the state on the
    /// procedure metadata.
    async fn execute_once(&mut self, ctx: &Context) {
        match self.meta.state() {
            ProcedureState::Running | ProcedureState::Retrying { .. } => {
                match self.procedure.execute(ctx).await {
                    Ok(status) => {
                        debug!(
                            "Execute procedure {}-{} once, status: {:?}, need_persist: {}",
                            self.procedure.type_name(),
                            self.meta.id,
                            status,
                            status.need_persist(),
                        );

                        // Don't store state if `ProcedureManager` is stopped.
                        if !self.running() {
                            self.meta.set_state(ProcedureState::failed(Arc::new(
                                error::ManagerNotStartSnafu {}.build(),
                            )));
                            return;
                        }

                        // Cleans poisons before persist.
                        // A failure here moves the procedure to `Retrying`, so the step is
                        // executed again by the retry loop.
                        if status.need_clean_poisons()
                            && let Err(e) = self.clean_poisons().await
                        {
                            error!(e; "Failed to clean poison for procedure: {}", self.meta.id);
                            self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                            return;
                        }

                        // Persist the procedure if the status asks for it; a failed write also
                        // moves the procedure to `Retrying`.
                        if status.need_persist()
                            && let Err(e) = self.persist_procedure().await
                        {
                            error!(e; "Failed to persist procedure: {}", self.meta.id);
                            self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                            return;
                        }

                        match status {
                            Status::Executing { .. } => {
                                // The step succeeded: leave any non-running state (e.g.
                                // `Retrying`) and go back to `Running`.
                                let prev_state = self.meta.state();
                                if !matches!(prev_state, ProcedureState::Running) {
                                    info!(
                                        "Set Procedure {}-{} state to running, prev_state: {:?}",
                                        self.procedure.type_name(),
                                        self.meta.id,
                                        prev_state
                                    );
                                    self.meta.set_state(ProcedureState::Running);
                                }
                            }
                            Status::Suspended { subprocedures, .. } => {
                                // Same state reset as for `Executing`, then hand the
                                // subprocedures over to `on_suspended`.
                                let prev_state = self.meta.state();
                                if !matches!(prev_state, ProcedureState::Running) {
                                    info!(
                                        "Set Procedure {}-{} state to running, prev_state: {:?}",
                                        self.procedure.type_name(),
                                        self.meta.id,
                                        prev_state
                                    );
                                    self.meta.set_state(ProcedureState::Running);
                                }
                                self.on_suspended(subprocedures).await;
                            }
                            Status::Done { output } => {
                                // The procedure is only marked as done after the commit has
                                // been written; a failed commit is retried.
                                if let Err(e) = self.commit_procedure().await {
                                    error!(e; "Failed to commit procedure: {}", self.meta.id);
                                    self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                                    return;
                                }

                                self.done(output);
                            }
                            Status::Poisoned { error, keys } => {
                                error!(
                                    error;
                                    "Procedure {}-{} is poisoned, keys: {:?}",
                                    self.procedure.type_name(),
                                    self.meta.id,
                                    keys,
                                );
                                self.meta
                                    .set_state(ProcedureState::poisoned(keys, Arc::new(error)));
                            }
                        }
                    }
                    Err(e) => {
                        error!(
                            e;
                            "Failed to execute procedure {}-{}, retry: {}, clean_poisons: {}",
                            self.procedure.type_name(),
                            self.meta.id,
                            e.is_retry_later(),
                            e.need_clean_poisons(),
                        );

                        // Don't store state if `ProcedureManager` is stopped.
                        if !self.running() {
                            self.meta.set_state(ProcedureState::failed(Arc::new(
                                error::ManagerNotStartSnafu {}.build(),
                            )));
                            return;
                        }

                        // The error may ask for the poisons to be cleaned; if that cleanup
                        // fails, the cleanup error (not the execution error) is what gets
                        // retried.
                        if e.need_clean_poisons() {
                            if let Err(e) = self.clean_poisons().await {
                                error!(e; "Failed to clean poison for procedure: {}", self.meta.id);
                                self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                                return;
                            }
                            debug!(
                                "Procedure {}-{} cleaned poisons",
                                self.procedure.type_name(),
                                self.meta.id,
                            );
                        }

                        // Retryable errors go to `Retrying`.
                        if e.is_retry_later() {
                            self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
                            return;
                        }

                        // Non-retryable errors start a rollback when the procedure supports
                        // one and fail the procedure otherwise.
                        if self.procedure.rollback_supported() {
                            self.meta
                                .set_state(ProcedureState::prepare_rollback(Arc::new(e)));
                        } else {
                            self.meta.set_state(ProcedureState::failed(Arc::new(e)));
                        }
                    }
                }
            }
            ProcedureState::PrepareRollback { error } => self.prepare_rollback(error).await,
            ProcedureState::RollingBack { error } => self.rollback(ctx, error).await,
            // Terminal states: nothing left to execute.
            ProcedureState::Failed { .. }
            | ProcedureState::Done { .. }
            | ProcedureState::Poisoned { .. } => (),
        }
    }
476
477    /// Submit a subprocedure with specific `procedure_id`.
478    fn submit_subprocedure(
479        &self,
480        procedure_id: ProcedureId,
481        procedure_state: ProcedureState,
482        procedure: BoxedProcedure,
483    ) {
484        if !self.running() {
485            warn!(
486                "ProcedureManager is not running, skip submitting subprocedure {}-{}",
487                procedure.type_name(),
488                procedure_id
489            );
490            return;
491        }
492
493        if self.manager_ctx.contains_procedure(procedure_id) {
494            // If the parent has already submitted this procedure, don't submit it again.
495            return;
496        }
497
498        let step = 0;
499
500        let meta = Arc::new(ProcedureMeta::new(
501            procedure_id,
502            procedure_state,
503            Some(self.meta.id),
504            procedure.lock_key(),
505            procedure.poison_keys(),
506            procedure.type_name(),
507            self.event_recorder.clone(),
508            procedure.user_metadata(),
509        ));
510        let runner = Runner {
511            meta: meta.clone(),
512            procedure,
513            manager_ctx: self.manager_ctx.clone(),
514            step,
515            exponential_builder: self.exponential_builder,
516            store: self.store.clone(),
517            rolling_back: false,
518            event_recorder: self.event_recorder.clone(),
519        };
520
521        // Insert the procedure. We already check the procedure existence before inserting
522        // so we add an assertion to ensure the procedure id is unique and no other procedures
523        // using the same procedure id.
524        assert!(
525            self.manager_ctx.try_insert_procedure(meta),
526            "Procedure {}-{} submit an existing procedure {}-{}",
527            self.procedure.type_name(),
528            self.meta.id,
529            runner.procedure.type_name(),
530            procedure_id,
531        );
532
533        let parent_id = self.meta.id;
534
535        let tracing_context = TracingContext::from_current_span();
536        if !self.manager_ctx.spawn_runner_task(procedure_id, || {
537            common_runtime::spawn_global(async move {
538                let span = tracing_context.attach(tracing::info_span!(
539                    "LocalManager::submit_subprocedure",
540                    procedure_name = %runner.meta.type_name,
541                    procedure_id = %runner.meta.id,
542                    parent_id = %parent_id,
543                ));
544                // Run the root procedure.
545                // The task was moved to another runtime for execution.
546                // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
547                runner.run().trace(span).await
548            })
549        }) {
550            self.manager_ctx.remove_procedure(procedure_id);
551            return;
552        }
553
554        // Add the id of the subprocedure to the metadata.
555        self.meta.push_child(procedure_id);
556    }
557
558    /// Extend the retry time to wait for the next retry.
559    async fn wait_on_err(&mut self, d: Duration, i: u64) {
560        info!(
561            "Procedure {}-{} retry for the {} times after {} millis",
562            self.procedure.type_name(),
563            self.meta.id,
564            i,
565            d.as_millis(),
566        );
567        time::sleep(d).await;
568    }
569
570    async fn on_suspended(&mut self, subprocedures: Vec<ProcedureWithId>) {
571        let has_child = !subprocedures.is_empty();
572
573        // Pre-check: detect potential deadlocks BEFORE submitting any subprocedure.
574        // If a child shares conflicting lock keys with the parent, submitting it would
575        // cause a Hold-and-Wait deadlock — the child blocks on lock acquisition while
576        // the parent holds the lock and waits for the child to finish.
577        for sub in &subprocedures {
578            let conflicting = find_lock_conflicts(
579                self.meta.lock_key.keys_to_lock(),
580                sub.procedure.lock_key().keys_to_lock(),
581            );
582            if !conflicting.is_empty() {
583                let err_msg = format!(
584                    "Deadlock prevented: subprocedure {}-{} shares conflicting lock key(s) {:?} \
585                     with parent {}-{}. Parent holds these locks and would wait for child \
586                     completion, but child cannot acquire them.",
587                    sub.procedure.type_name(),
588                    sub.id,
589                    conflicting,
590                    self.procedure.type_name(),
591                    self.meta.id,
592                );
593                error!("{}", err_msg);
594                let err = Arc::new(Error::external(PlainError::new(
595                    err_msg,
596                    StatusCode::Internal,
597                )));
598                if self.procedure.rollback_supported() {
599                    self.meta.set_state(ProcedureState::prepare_rollback(err));
600                } else {
601                    self.meta.set_state(ProcedureState::failed(err));
602                }
603                return;
604            }
605        }
606
607        for subprocedure in subprocedures {
608            info!(
609                "Procedure {}-{} submit subprocedure {}-{}",
610                self.procedure.type_name(),
611                self.meta.id,
612                subprocedure.procedure.type_name(),
613                subprocedure.id,
614            );
615
616            self.submit_subprocedure(
617                subprocedure.id,
618                ProcedureState::Running,
619                subprocedure.procedure,
620            );
621        }
622
623        info!(
624            "Procedure {}-{} is waiting for subprocedures",
625            self.procedure.type_name(),
626            self.meta.id,
627        );
628
629        // Wait for subprocedures.
630        if has_child {
631            self.meta.child_notify.notified().await;
632
633            info!(
634                "Procedure {}-{} is waked up",
635                self.procedure.type_name(),
636                self.meta.id,
637            );
638        }
639    }
640
641    async fn persist_procedure(&mut self) -> Result<()> {
642        let type_name = self.procedure.type_name().to_string();
643        let data = self.procedure.dump()?;
644
645        self.store
646            .store_procedure(
647                self.meta.id,
648                self.step,
649                type_name,
650                data,
651                self.meta.parent_id,
652            )
653            .await
654            .map_err(|e| {
655                error!(
656                    e; "Failed to persist procedure {}-{}",
657                    self.procedure.type_name(),
658                    self.meta.id
659                );
660                e
661            })?;
662        self.step += 1;
663        Ok(())
664    }
665
666    async fn commit_procedure(&mut self) -> Result<()> {
667        self.store
668            .commit_procedure(self.meta.id, self.step)
669            .await
670            .map_err(|e| {
671                error!(
672                    e; "Failed to commit procedure {}-{}",
673                    self.procedure.type_name(),
674                    self.meta.id
675                );
676                e
677            })?;
678        self.step += 1;
679        Ok(())
680    }
681
682    async fn write_rollback_procedure_state(&mut self, error: String) -> Result<()> {
683        // Persists procedure state
684        let type_name = self.procedure.type_name().to_string();
685        let data = self.procedure.dump()?;
686        let message = ProcedureMessage {
687            type_name,
688            data,
689            parent_id: self.meta.parent_id,
690            step: self.step,
691            error: Some(error),
692        };
693        self.store
694            .rollback_procedure(self.meta.id, message)
695            .await
696            .map_err(|e| {
697                error!(
698                    e; "Failed to write rollback key for procedure {}-{}",
699                    self.procedure.type_name(),
700                    self.meta.id
701                );
702                e
703            })?;
704        self.step += 1;
705        Ok(())
706    }
707
708    fn done(&self, output: Option<Output>) {
709        // TODO(yingwen): Add files to remove list.
710        info!(
711            "Procedure {}-{} done",
712            self.procedure.type_name(),
713            self.meta.id,
714        );
715
716        // Mark the state of this procedure to done.
717        self.meta.set_state(ProcedureState::Done { output });
718    }
719}
720
721impl Drop for Runner {
722    fn drop(&mut self) {
723        self.manager_ctx.remove_runner_task(self.meta.id);
724    }
725}
726
727#[cfg(test)]
728mod tests {
729    use std::assert_matches;
730    use std::sync::Arc;
731    use std::sync::atomic::{AtomicU64, Ordering};
732
733    use async_trait::async_trait;
734    use common_error::ext::{ErrorExt, PlainError};
735    use common_error::mock::MockError;
736    use common_error::status_code::StatusCode;
737    use common_test_util::temp_dir::create_temp_dir;
738    use futures::future::join_all;
739    use futures_util::FutureExt;
740    use futures_util::future::BoxFuture;
741    use object_store::{EntryMode, ObjectStore};
742    use tokio::sync::mpsc;
743    use tokio::sync::watch::Receiver;
744
745    use super::*;
746    use crate::local::{DynamicKeyLockGuard, test_util};
747    use crate::procedure::PoisonKeys;
748    use crate::store::proc_path;
749    use crate::test_util::InMemoryPoisonStore;
750    use crate::{ContextProvider, Error, LockKey, PoisonKey, Procedure};
751
752    const ROOT_ID: &str = "9f805a1f-05f7-490c-9f91-bd56e3cc54c1";
753
754    fn new_runner(
755        meta: ProcedureMetaRef,
756        procedure: BoxedProcedure,
757        store: Arc<ProcedureStore>,
758    ) -> Runner {
759        Runner {
760            meta,
761            procedure,
762            manager_ctx: Arc::new(ManagerContext::new(
763                Arc::new(InMemoryPoisonStore::default()),
764            )),
765            step: 0,
766            exponential_builder: ExponentialBuilder::default(),
767            store,
768            rolling_back: false,
769            event_recorder: None,
770        }
771    }
772
773    async fn check_files(
774        object_store: &ObjectStore,
775        procedure_store: &ProcedureStore,
776        procedure_id: ProcedureId,
777        files: &[&str],
778    ) {
779        let dir = proc_path!(procedure_store, "{procedure_id}/");
780        let lister = object_store.list(&dir).await.unwrap();
781        let mut files_in_dir: Vec<_> = lister
782            .into_iter()
783            .filter(|x| x.metadata().mode() == EntryMode::FILE)
784            .map(|de| de.name().to_string())
785            .collect();
786        files_in_dir.sort_unstable();
787        assert_eq!(files, files_in_dir);
788    }
789
790    fn context_with_provider(
791        procedure_id: ProcedureId,
792        provider: Arc<dyn ContextProvider>,
793    ) -> Context {
794        Context {
795            procedure_id,
796            provider,
797        }
798    }
799
800    fn context_without_provider(procedure_id: ProcedureId) -> Context {
801        struct MockProvider;
802
803        #[async_trait]
804        impl ContextProvider for MockProvider {
805            async fn procedure_state(
806                &self,
807                _procedure_id: ProcedureId,
808            ) -> Result<Option<ProcedureState>> {
809                unimplemented!()
810            }
811
812            async fn procedure_state_receiver(
813                &self,
814                _procedure_id: ProcedureId,
815            ) -> Result<Option<Receiver<ProcedureState>>> {
816                unimplemented!()
817            }
818
819            async fn try_put_poison(
820                &self,
821                _key: &PoisonKey,
822                _procedure_id: ProcedureId,
823            ) -> Result<()> {
824                unimplemented!()
825            }
826
827            async fn acquire_lock(&self, _key: &StringKey) -> DynamicKeyLockGuard {
828                unimplemented!()
829            }
830        }
831
832        Context {
833            procedure_id,
834            provider: Arc::new(MockProvider),
835        }
836    }
837
838    type RollbackFn = Box<dyn FnMut(Context) -> BoxFuture<'static, Result<()>> + Send>;
839
840    struct ProcedureAdapter<F> {
841        data: String,
842        lock_key: LockKey,
843        poison_keys: PoisonKeys,
844        exec_fn: F,
845        rollback_fn: Option<RollbackFn>,
846    }
847
848    impl<F> ProcedureAdapter<F> {
849        fn new_meta(&self, uuid: &str) -> ProcedureMetaRef {
850            let mut meta = test_util::procedure_meta_for_test();
851            meta.id = ProcedureId::parse_str(uuid).unwrap();
852            meta.lock_key = self.lock_key.clone();
853            meta.poison_keys = self.poison_keys.clone();
854
855            Arc::new(meta)
856        }
857    }
858
859    #[async_trait]
860    impl<F> Procedure for ProcedureAdapter<F>
861    where
862        F: FnMut(Context) -> BoxFuture<'static, Result<Status>> + Send + Sync,
863    {
864        fn type_name(&self) -> &str {
865            "ProcedureAdapter"
866        }
867
868        async fn execute(&mut self, ctx: &Context) -> Result<Status> {
869            let f = (self.exec_fn)(ctx.clone());
870            f.await
871        }
872
873        async fn rollback(&mut self, ctx: &Context) -> Result<()> {
874            if let Some(f) = &mut self.rollback_fn {
875                return (f)(ctx.clone()).await;
876            }
877            Ok(())
878        }
879
880        fn rollback_supported(&self) -> bool {
881            self.rollback_fn.is_some()
882        }
883
884        fn dump(&self) -> Result<String> {
885            Ok(self.data.clone())
886        }
887
888        fn lock_key(&self) -> LockKey {
889            self.lock_key.clone()
890        }
891
892        fn poison_keys(&self) -> PoisonKeys {
893            self.poison_keys.clone()
894        }
895    }
896
897    async fn execute_once_normal(persist: bool, first_files: &[&str], second_files: &[&str]) {
898        let mut times = 0;
899        let exec_fn = move |_| {
900            times += 1;
901            async move {
902                if times == 1 {
903                    Ok(Status::executing(persist))
904                } else {
905                    Ok(Status::done())
906                }
907            }
908            .boxed()
909        };
910        let normal = ProcedureAdapter {
911            data: "normal".to_string(),
912            lock_key: LockKey::single_exclusive("catalog.schema.table"),
913            poison_keys: PoisonKeys::default(),
914            exec_fn,
915            rollback_fn: None,
916        };
917
918        let dir = create_temp_dir("normal");
919        let meta = normal.new_meta(ROOT_ID);
920        let ctx = context_without_provider(meta.id);
921        let object_store = test_util::new_object_store(&dir);
922        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
923        let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
924        runner.manager_ctx.start();
925
926        runner.execute_once(&ctx).await;
927        let state = runner.meta.state();
928        assert!(state.is_running(), "{state:?}");
929        check_files(
930            &object_store,
931            &procedure_store,
932            ctx.procedure_id,
933            first_files,
934        )
935        .await;
936
937        runner.execute_once(&ctx).await;
938        let state = runner.meta.state();
939        assert!(state.is_done(), "{state:?}");
940        check_files(
941            &object_store,
942            &procedure_store,
943            ctx.procedure_id,
944            second_files,
945        )
946        .await;
947    }
948
949    #[tokio::test]
950    async fn test_execute_once_normal() {
951        execute_once_normal(
952            true,
953            &["0000000000.step"],
954            &["0000000000.step", "0000000001.commit"],
955        )
956        .await;
957    }
958
959    #[tokio::test]
960    async fn test_execute_once_normal_skip_persist() {
961        execute_once_normal(false, &[], &["0000000000.commit"]).await;
962    }
963
964    #[tokio::test]
965    async fn test_on_suspend_empty() {
966        let exec_fn = move |_| {
967            async move {
968                Ok(Status::Suspended {
969                    subprocedures: Vec::new(),
970                    persist: false,
971                })
972            }
973            .boxed()
974        };
975        let suspend = ProcedureAdapter {
976            data: "suspend".to_string(),
977            lock_key: LockKey::single_exclusive("catalog.schema.table"),
978            poison_keys: PoisonKeys::default(),
979            exec_fn,
980            rollback_fn: None,
981        };
982
983        let dir = create_temp_dir("suspend");
984        let meta = suspend.new_meta(ROOT_ID);
985        let ctx = context_without_provider(meta.id);
986        let object_store = test_util::new_object_store(&dir);
987        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
988        let mut runner = new_runner(meta, Box::new(suspend), procedure_store);
989        runner.manager_ctx.start();
990
991        runner.execute_once(&ctx).await;
992        let state = runner.meta.state();
993        assert!(state.is_running(), "{state:?}");
994    }
995
996    fn new_child_procedure(procedure_id: ProcedureId, keys: &[&str]) -> ProcedureWithId {
997        let mut times = 0;
998        let exec_fn = move |_| {
999            times += 1;
1000            async move {
1001                if times == 1 {
1002                    time::sleep(Duration::from_millis(200)).await;
1003                    Ok(Status::executing(true))
1004                } else {
1005                    Ok(Status::done())
1006                }
1007            }
1008            .boxed()
1009        };
1010        let child = ProcedureAdapter {
1011            data: "child".to_string(),
1012            lock_key: LockKey::new_exclusive(keys.iter().map(|k| k.to_string())),
1013            poison_keys: PoisonKeys::default(),
1014            exec_fn,
1015            rollback_fn: None,
1016        };
1017
1018        ProcedureWithId {
1019            id: procedure_id,
1020            procedure: Box::new(child),
1021        }
1022    }
1023
1024    #[tokio::test]
1025    async fn test_on_suspend_by_subprocedures() {
1026        let mut times = 0;
1027        let children_ids = [ProcedureId::random(), ProcedureId::random()];
1028        let keys = [
1029            &[
1030                "catalog.schema.table.region-0",
1031                "catalog.schema.table.region-1",
1032            ],
1033            &[
1034                "catalog.schema.table.region-2",
1035                "catalog.schema.table.region-3",
1036            ],
1037        ];
1038
1039        let exec_fn = move |ctx: Context| {
1040            times += 1;
1041            async move {
1042                if times == 1 {
1043                    // Submit subprocedures.
1044                    Ok(Status::Suspended {
1045                        subprocedures: children_ids
1046                            .into_iter()
1047                            .zip(keys)
1048                            .map(|(id, key_slice)| new_child_procedure(id, key_slice))
1049                            .collect(),
1050                        persist: true,
1051                    })
1052                } else {
1053                    // Wait for subprocedures.
1054                    let mut all_child_done = true;
1055                    for id in children_ids {
1056                        let is_not_done = ctx
1057                            .provider
1058                            .procedure_state(id)
1059                            .await
1060                            .unwrap()
1061                            .map(|s| !s.is_done())
1062                            .unwrap_or(true);
1063                        if is_not_done {
1064                            all_child_done = false;
1065                        }
1066                    }
1067                    if all_child_done {
1068                        Ok(Status::done())
1069                    } else {
1070                        // Return suspended to wait for notify.
1071                        Ok(Status::Suspended {
1072                            subprocedures: Vec::new(),
1073                            persist: false,
1074                        })
1075                    }
1076                }
1077            }
1078            .boxed()
1079        };
1080        let parent = ProcedureAdapter {
1081            data: "parent".to_string(),
1082            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1083            poison_keys: PoisonKeys::default(),
1084            exec_fn,
1085            rollback_fn: None,
1086        };
1087
1088        let dir = create_temp_dir("parent");
1089        let meta = parent.new_meta(ROOT_ID);
1090        let procedure_id = meta.id;
1091
1092        let object_store = test_util::new_object_store(&dir);
1093        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1094        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store.clone());
1095        let poison_manager = Arc::new(InMemoryPoisonStore::default());
1096        let manager_ctx = Arc::new(ManagerContext::new(poison_manager));
1097        manager_ctx.start();
1098        // Manually add this procedure to the manager ctx.
1099        assert!(manager_ctx.try_insert_procedure(meta));
1100        // Replace the manager ctx.
1101        runner.manager_ctx = manager_ctx.clone();
1102
1103        runner.run().await;
1104        assert!(manager_ctx.key_lock.is_empty());
1105
1106        // Check child procedures.
1107        for child_id in children_ids {
1108            let state = manager_ctx.state(child_id).unwrap();
1109            assert!(state.is_done(), "{state:?}");
1110        }
1111        let state = manager_ctx.state(procedure_id).unwrap();
1112        assert!(state.is_done(), "{state:?}");
1113        // Files are removed.
1114        check_files(&object_store, &procedure_store, procedure_id, &[]).await;
1115
1116        tokio::time::sleep(Duration::from_millis(5)).await;
1117        // Clean outdated meta.
1118        manager_ctx.remove_outdated_meta(Duration::from_millis(1));
1119        assert!(manager_ctx.state(procedure_id).is_none());
1120        assert!(manager_ctx.finished_procedures.lock().unwrap().is_empty());
1121        for child_id in children_ids {
1122            assert!(manager_ctx.state(child_id).is_none());
1123        }
1124    }
1125
1126    #[tokio::test]
1127    async fn test_running_is_stopped() {
1128        let exec_fn = move |_| async move { Ok(Status::executing(true)) }.boxed();
1129        let normal = ProcedureAdapter {
1130            data: "normal".to_string(),
1131            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1132            poison_keys: PoisonKeys::default(),
1133            exec_fn,
1134            rollback_fn: None,
1135        };
1136
1137        let dir = create_temp_dir("test_running_is_stopped");
1138        let meta = normal.new_meta(ROOT_ID);
1139        let ctx = context_without_provider(meta.id);
1140        let object_store = test_util::new_object_store(&dir);
1141        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1142        let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
1143        runner.manager_ctx.start();
1144
1145        runner.execute_once(&ctx).await;
1146        let state = runner.meta.state();
1147        assert!(state.is_running(), "{state:?}");
1148        check_files(
1149            &object_store,
1150            &procedure_store,
1151            ctx.procedure_id,
1152            &["0000000000.step"],
1153        )
1154        .await;
1155
1156        runner.manager_ctx.stop();
1157        runner.execute_once(&ctx).await;
1158        let state = runner.meta.state();
1159        assert!(state.is_failed(), "{state:?}");
1160        // Shouldn't write any files
1161        check_files(
1162            &object_store,
1163            &procedure_store,
1164            ctx.procedure_id,
1165            &["0000000000.step"],
1166        )
1167        .await;
1168    }
1169
1170    #[tokio::test]
1171    async fn test_running_is_stopped_on_error() {
1172        let exec_fn =
1173            |_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
1174        let normal = ProcedureAdapter {
1175            data: "fail".to_string(),
1176            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1177            poison_keys: PoisonKeys::default(),
1178            exec_fn,
1179            rollback_fn: None,
1180        };
1181
1182        let dir = create_temp_dir("test_running_is_stopped_on_error");
1183        let meta = normal.new_meta(ROOT_ID);
1184        let ctx = context_without_provider(meta.id);
1185        let object_store = test_util::new_object_store(&dir);
1186        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1187        let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
1188        runner.manager_ctx.stop();
1189
1190        runner.execute_once(&ctx).await;
1191        let state = runner.meta.state();
1192        assert!(state.is_failed(), "{state:?}");
1193        // Shouldn't write any files
1194        check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
1195    }
1196
1197    #[tokio::test]
1198    async fn test_execute_on_error() {
1199        let exec_fn =
1200            |_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
1201        let fail = ProcedureAdapter {
1202            data: "fail".to_string(),
1203            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1204            poison_keys: PoisonKeys::default(),
1205            exec_fn,
1206            rollback_fn: None,
1207        };
1208
1209        let dir = create_temp_dir("fail");
1210        let meta = fail.new_meta(ROOT_ID);
1211        let ctx = context_without_provider(meta.id);
1212        let object_store = test_util::new_object_store(&dir);
1213        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1214        let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
1215        runner.manager_ctx.start();
1216
1217        runner.execute_once(&ctx).await;
1218        let state = runner.meta.state();
1219        assert!(state.is_failed(), "{state:?}");
1220        check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
1221    }
1222
1223    #[tokio::test]
1224    async fn test_execute_with_rollback_on_error() {
1225        let exec_fn =
1226            |_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
1227        let rollback_fn = move |_| async move { Ok(()) }.boxed();
1228        let fail = ProcedureAdapter {
1229            data: "fail".to_string(),
1230            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1231            poison_keys: PoisonKeys::default(),
1232            exec_fn,
1233            rollback_fn: Some(Box::new(rollback_fn)),
1234        };
1235
1236        let dir = create_temp_dir("fail");
1237        let meta = fail.new_meta(ROOT_ID);
1238        let ctx = context_without_provider(meta.id);
1239        let object_store = test_util::new_object_store(&dir);
1240        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1241        let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
1242        runner.manager_ctx.start();
1243
1244        runner.execute_once(&ctx).await;
1245        let state = runner.meta.state();
1246        assert!(state.is_prepare_rollback(), "{state:?}");
1247
1248        runner.execute_once(&ctx).await;
1249        let state = runner.meta.state();
1250        assert!(state.is_rolling_back(), "{state:?}");
1251
1252        runner.execute_once(&ctx).await;
1253        let state = runner.meta.state();
1254        assert!(state.is_failed(), "{state:?}");
1255        check_files(
1256            &object_store,
1257            &procedure_store,
1258            ctx.procedure_id,
1259            &["0000000000.rollback"],
1260        )
1261        .await;
1262    }
1263
1264    #[tokio::test]
1265    async fn test_execute_on_retry_later_error() {
1266        let mut times = 0;
1267
1268        let exec_fn = move |_| {
1269            times += 1;
1270            async move {
1271                if times == 1 {
1272                    Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
1273                } else if times == 2 {
1274                    Ok(Status::executing(false))
1275                } else {
1276                    Ok(Status::done())
1277                }
1278            }
1279            .boxed()
1280        };
1281
1282        let retry_later = ProcedureAdapter {
1283            data: "retry_later".to_string(),
1284            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1285            poison_keys: PoisonKeys::default(),
1286            exec_fn,
1287            rollback_fn: None,
1288        };
1289
1290        let dir = create_temp_dir("retry_later");
1291        let meta = retry_later.new_meta(ROOT_ID);
1292        let ctx = context_without_provider(meta.id);
1293        let object_store = test_util::new_object_store(&dir);
1294        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1295        let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
1296        runner.manager_ctx.start();
1297        runner.execute_once(&ctx).await;
1298        let state = runner.meta.state();
1299        assert!(state.is_retrying(), "{state:?}");
1300
1301        runner.execute_once(&ctx).await;
1302        let state = runner.meta.state();
1303        assert!(state.is_running(), "{state:?}");
1304
1305        runner.execute_once(&ctx).await;
1306        let state = runner.meta.state();
1307        assert!(state.is_done(), "{state:?}");
1308        assert!(meta.state().is_done());
1309        check_files(
1310            &object_store,
1311            &procedure_store,
1312            ctx.procedure_id,
1313            &["0000000000.commit"],
1314        )
1315        .await;
1316    }
1317
1318    #[tokio::test]
1319    async fn test_retrying_state_visible_in_context_on_retry() {
1320        let retrying_states = Arc::new(std::sync::Mutex::new(Vec::new()));
1321        let captured = retrying_states.clone();
1322        let mut times = 0;
1323
1324        let exec_fn = move |ctx: Context| {
1325            times += 1;
1326            let captured = captured.clone();
1327            async move {
1328                let is_retrying = ctx.is_retrying().await;
1329                captured.lock().unwrap().push(is_retrying);
1330                if times == 1 {
1331                    Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
1332                } else {
1333                    Ok(Status::done())
1334                }
1335            }
1336            .boxed()
1337        };
1338
1339        let procedure = ProcedureAdapter {
1340            data: "retrying_state".to_string(),
1341            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1342            poison_keys: PoisonKeys::default(),
1343            exec_fn,
1344            rollback_fn: None,
1345        };
1346
1347        let dir = create_temp_dir("retrying_state");
1348        let meta = procedure.new_meta(ROOT_ID);
1349        let object_store = test_util::new_object_store(&dir);
1350        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store));
1351        let mut runner = new_runner(meta.clone(), Box::new(procedure), procedure_store);
1352        let ctx = context_with_provider(
1353            meta.id,
1354            runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1355        );
1356
1357        runner
1358            .manager_ctx
1359            .procedures
1360            .write()
1361            .unwrap()
1362            .insert(meta.id, runner.meta.clone());
1363        runner.manager_ctx.start();
1364
1365        runner.execute_once(&ctx).await;
1366        runner.execute_once(&ctx).await;
1367
1368        let states = retrying_states.lock().unwrap().clone();
1369        assert_eq!(states, vec![Some(false), Some(true)]);
1370    }
1371
1372    #[tokio::test(flavor = "multi_thread")]
1373    async fn test_execute_on_retry_later_error_with_child() {
1374        common_telemetry::init_default_ut_logging();
1375        let mut times = 0;
1376        let child_id = ProcedureId::random();
1377
1378        let exec_fn = move |_| {
1379            times += 1;
1380            async move {
1381                debug!("times: {}", times);
1382                if times == 1 {
1383                    Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
1384                } else if times == 2 {
1385                    let exec_fn = |_| {
1386                        async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }
1387                            .boxed()
1388                    };
1389                    let fail = ProcedureAdapter {
1390                        data: "fail".to_string(),
1391                        lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
1392                        poison_keys: PoisonKeys::default(),
1393                        exec_fn,
1394                        rollback_fn: None,
1395                    };
1396
1397                    Ok(Status::Suspended {
1398                        subprocedures: vec![ProcedureWithId {
1399                            id: child_id,
1400                            procedure: Box::new(fail),
1401                        }],
1402                        persist: true,
1403                    })
1404                } else {
1405                    Ok(Status::done())
1406                }
1407            }
1408            .boxed()
1409        };
1410
1411        let retry_later = ProcedureAdapter {
1412            data: "retry_later".to_string(),
1413            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1414            poison_keys: PoisonKeys::default(),
1415            exec_fn,
1416            rollback_fn: None,
1417        };
1418
1419        let dir = create_temp_dir("retry_later");
1420        let meta = retry_later.new_meta(ROOT_ID);
1421        let ctx = context_without_provider(meta.id);
1422        let object_store = test_util::new_object_store(&dir);
1423        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1424        let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
1425        runner.manager_ctx.start();
1426        debug!("execute_once 1");
1427        runner.execute_once(&ctx).await;
1428        let state = runner.meta.state();
1429        assert!(state.is_retrying(), "{state:?}");
1430
1431        let moved_meta = meta.clone();
1432        tokio::spawn(async move {
1433            moved_meta.child_notify.notify_one();
1434        });
1435        runner.execute_once(&ctx).await;
1436        let state = runner.meta.state();
1437        assert!(state.is_running(), "{state:?}");
1438
1439        runner.execute_once(&ctx).await;
1440        let state = runner.meta.state();
1441        assert!(state.is_done(), "{state:?}");
1442        assert!(meta.state().is_done());
1443        check_files(
1444            &object_store,
1445            &procedure_store,
1446            ctx.procedure_id,
1447            &["0000000000.step", "0000000001.commit"],
1448        )
1449        .await;
1450    }
1451
1452    #[tokio::test]
1453    async fn test_execute_exceed_max_retry_later() {
1454        let exec_fn =
1455            |_| async { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed();
1456
1457        let exceed_max_retry_later = ProcedureAdapter {
1458            data: "exceed_max_retry_later".to_string(),
1459            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1460            poison_keys: PoisonKeys::default(),
1461            exec_fn,
1462            rollback_fn: None,
1463        };
1464
1465        let dir = create_temp_dir("exceed_max_retry_later");
1466        let meta = exceed_max_retry_later.new_meta(ROOT_ID);
1467        let object_store = test_util::new_object_store(&dir);
1468        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1469        let mut runner = new_runner(
1470            meta.clone(),
1471            Box::new(exceed_max_retry_later),
1472            procedure_store,
1473        );
1474        runner.manager_ctx.start();
1475
1476        runner.exponential_builder = ExponentialBuilder::default()
1477            .with_min_delay(Duration::from_millis(1))
1478            .with_max_times(3);
1479
1480        // Run the runner and execute the procedure.
1481        runner.execute_procedure_in_loop().await;
1482        let err = meta.state().error().unwrap().to_string();
1483        assert!(err.contains("Procedure retry exceeded max times"));
1484    }
1485
1486    #[tokio::test]
1487    async fn test_rollback_exceed_max_retry_later() {
1488        let exec_fn =
1489            |_| async { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed();
1490        let rollback_fn = move |_| {
1491            async move { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed()
1492        };
1493        let exceed_max_retry_later = ProcedureAdapter {
1494            data: "exceed_max_rollback".to_string(),
1495            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1496            poison_keys: PoisonKeys::default(),
1497            exec_fn,
1498            rollback_fn: Some(Box::new(rollback_fn)),
1499        };
1500
1501        let dir = create_temp_dir("exceed_max_rollback");
1502        let meta = exceed_max_retry_later.new_meta(ROOT_ID);
1503        let object_store = test_util::new_object_store(&dir);
1504        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1505        let mut runner = new_runner(
1506            meta.clone(),
1507            Box::new(exceed_max_retry_later),
1508            procedure_store,
1509        );
1510        runner.manager_ctx.start();
1511        runner.exponential_builder = ExponentialBuilder::default()
1512            .with_min_delay(Duration::from_millis(1))
1513            .with_max_times(3);
1514
1515        // Run the runner and execute the procedure.
1516        runner.execute_procedure_in_loop().await;
1517        let err = meta.state().error().unwrap().to_string();
1518        assert!(err.contains("Procedure rollback exceeded max times"));
1519    }
1520
1521    #[tokio::test]
1522    async fn test_rollback_after_retry_fail() {
1523        let exec_fn = move |_| {
1524            async move { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed()
1525        };
1526
1527        let (tx, mut rx) = mpsc::channel(1);
1528        let rollback_fn = move |_| {
1529            let tx = tx.clone();
1530            async move {
1531                tx.send(()).await.unwrap();
1532                Ok(())
1533            }
1534            .boxed()
1535        };
1536        let retry_later = ProcedureAdapter {
1537            data: "rollback_after_retry_fail".to_string(),
1538            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1539            poison_keys: PoisonKeys::default(),
1540            exec_fn,
1541            rollback_fn: Some(Box::new(rollback_fn)),
1542        };
1543
1544        let dir = create_temp_dir("retry_later");
1545        let meta = retry_later.new_meta(ROOT_ID);
1546        let ctx = context_without_provider(meta.id);
1547        let object_store = test_util::new_object_store(&dir);
1548        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1549        let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
1550        runner.manager_ctx.start();
1551        runner.exponential_builder = ExponentialBuilder::default()
1552            .with_min_delay(Duration::from_millis(1))
1553            .with_max_times(3);
1554        // Run the runner and execute the procedure.
1555        runner.execute_procedure_in_loop().await;
1556        rx.recv().await.unwrap();
1557        assert_eq!(rx.try_recv().unwrap_err(), mpsc::error::TryRecvError::Empty);
1558        check_files(
1559            &object_store,
1560            &procedure_store,
1561            ctx.procedure_id,
1562            &["0000000000.rollback"],
1563        )
1564        .await;
1565    }
1566
1567    #[tokio::test]
1568    async fn test_child_error() {
1569        let mut times = 0;
1570        let child_id = ProcedureId::random();
1571        common_telemetry::init_default_ut_logging();
1572        let exec_fn = move |ctx: Context| {
1573            times += 1;
1574            async move {
1575                if times == 1 {
1576                    // Submit subprocedures.
1577                    let exec_fn = |_| {
1578                        async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }
1579                            .boxed()
1580                    };
1581                    let fail = ProcedureAdapter {
1582                        data: "fail".to_string(),
1583                        lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
1584                        poison_keys: PoisonKeys::default(),
1585                        exec_fn,
1586                        rollback_fn: None,
1587                    };
1588
1589                    Ok(Status::Suspended {
1590                        subprocedures: vec![ProcedureWithId {
1591                            id: child_id,
1592                            procedure: Box::new(fail),
1593                        }],
1594                        persist: true,
1595                    })
1596                } else {
1597                    // Wait for subprocedures.
1598                    let state = ctx.provider.procedure_state(child_id).await.unwrap();
1599                    let is_failed = state.map(|s| s.is_failed()).unwrap_or(false);
1600                    if is_failed {
1601                        // The parent procedure to abort itself if child procedure is failed.
1602                        Err(Error::from_error_ext(PlainError::new(
1603                            "subprocedure failed".to_string(),
1604                            StatusCode::Unexpected,
1605                        )))
1606                    } else {
1607                        // Return suspended to wait for notify.
1608                        Ok(Status::Suspended {
1609                            subprocedures: Vec::new(),
1610                            persist: false,
1611                        })
1612                    }
1613                }
1614            }
1615            .boxed()
1616        };
1617        let parent = ProcedureAdapter {
1618            data: "parent".to_string(),
1619            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1620            poison_keys: PoisonKeys::default(),
1621            exec_fn,
1622            rollback_fn: None,
1623        };
1624
1625        let dir = create_temp_dir("child_err");
1626        let meta = parent.new_meta(ROOT_ID);
1627
1628        let object_store = test_util::new_object_store(&dir);
1629        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1630        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
1631        let poison_manager = Arc::new(InMemoryPoisonStore::default());
1632        let manager_ctx = Arc::new(ManagerContext::new(poison_manager));
1633        manager_ctx.start();
1634        // Manually add this procedure to the manager ctx.
1635        assert!(manager_ctx.try_insert_procedure(meta.clone()));
1636        // Replace the manager ctx.
1637        runner.manager_ctx = manager_ctx.clone();
1638
1639        // Run the runner and execute the procedure.
1640        runner.run().await;
1641        assert!(manager_ctx.key_lock.is_empty());
1642        let err = meta.state().error().unwrap().output_msg();
1643        assert!(err.contains("subprocedure failed"), "{err}");
1644    }
1645
1646    #[tokio::test]
1647    async fn test_execute_with_clean_poisons() {
1648        common_telemetry::init_default_ut_logging();
1649        let mut times = 0;
1650        let poison_key = PoisonKey::new("table/1024");
1651        let moved_poison_key = poison_key.clone();
1652        let exec_fn = move |ctx: Context| {
1653            times += 1;
1654            let poison_key = moved_poison_key.clone();
1655            async move {
1656                if times == 1 {
1657                    // Put the poison to the context.
1658                    ctx.provider
1659                        .try_put_poison(&poison_key, ctx.procedure_id)
1660                        .await
1661                        .unwrap();
1662
1663                    Ok(Status::executing(true))
1664                } else {
1665                    Ok(Status::executing_with_clean_poisons(true))
1666                }
1667            }
1668            .boxed()
1669        };
1670        let poison = ProcedureAdapter {
1671            data: "poison".to_string(),
1672            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1673            poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1674            exec_fn,
1675            rollback_fn: None,
1676        };
1677
1678        let dir = create_temp_dir("clean_poisons");
1679        let meta = poison.new_meta(ROOT_ID);
1680
1681        let object_store = test_util::new_object_store(&dir);
1682        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1683        let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store.clone());
1684
1685        // Use the manager ctx as the context provider.
1686        let ctx = context_with_provider(
1687            meta.id,
1688            runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1689        );
1690        // Manually add this procedure to the manager ctx.
1691        runner
1692            .manager_ctx
1693            .procedures
1694            .write()
1695            .unwrap()
1696            .insert(meta.id, runner.meta.clone());
1697
1698        runner.manager_ctx.start();
1699        runner.execute_once(&ctx).await;
1700        let state = runner.meta.state();
1701        assert!(state.is_running(), "{state:?}");
1702
1703        let procedure_id = runner
1704            .manager_ctx
1705            .poison_manager
1706            .get_poison(&poison_key.to_string())
1707            .await
1708            .unwrap();
1709        // poison key should be exist.
1710        assert!(procedure_id.is_some());
1711
1712        runner.execute_once(&ctx).await;
1713        let state = runner.meta.state();
1714        assert!(state.is_running(), "{state:?}");
1715
1716        let procedure_id = runner
1717            .manager_ctx
1718            .poison_manager
1719            .get_poison(&poison_key.to_string())
1720            .await
1721            .unwrap();
1722        // poison key should be deleted.
1723        assert!(procedure_id.is_none());
1724    }
1725
1726    #[tokio::test]
1727    async fn test_execute_error_with_clean_poisons() {
1728        common_telemetry::init_default_ut_logging();
1729        let mut times = 0;
1730        let poison_key = PoisonKey::new("table/1024");
1731        let moved_poison_key = poison_key.clone();
1732        let exec_fn = move |ctx: Context| {
1733            times += 1;
1734            let poison_key = moved_poison_key.clone();
1735            async move {
1736                if times == 1 {
1737                    // Put the poison to the context.
1738                    ctx.provider
1739                        .try_put_poison(&poison_key, ctx.procedure_id)
1740                        .await
1741                        .unwrap();
1742
1743                    Ok(Status::executing(true))
1744                } else {
1745                    Err(Error::external_and_clean_poisons(MockError::new(
1746                        StatusCode::Unexpected,
1747                    )))
1748                }
1749            }
1750            .boxed()
1751        };
1752        let poison = ProcedureAdapter {
1753            data: "poison".to_string(),
1754            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1755            poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1756            exec_fn,
1757            rollback_fn: None,
1758        };
1759
1760        let dir = create_temp_dir("error_with_clean_poisons");
1761        let meta = poison.new_meta(ROOT_ID);
1762
1763        let object_store = test_util::new_object_store(&dir);
1764        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1765        let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store.clone());
1766
1767        // Use the manager ctx as the context provider.
1768        let ctx = context_with_provider(
1769            meta.id,
1770            runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1771        );
1772        // Manually add this procedure to the manager ctx.
1773        runner
1774            .manager_ctx
1775            .procedures
1776            .write()
1777            .unwrap()
1778            .insert(meta.id, runner.meta.clone());
1779
1780        runner.manager_ctx.start();
1781        runner.execute_once(&ctx).await;
1782        let state = runner.meta.state();
1783        assert!(state.is_running(), "{state:?}");
1784
1785        let procedure_id = runner
1786            .manager_ctx
1787            .poison_manager
1788            .get_poison(&poison_key.to_string())
1789            .await
1790            .unwrap();
1791        // poison key should be exist.
1792        assert!(procedure_id.is_some());
1793
1794        runner.execute_once(&ctx).await;
1795        let state = runner.meta.state();
1796        assert!(state.is_failed(), "{state:?}");
1797
1798        let procedure_id = runner
1799            .manager_ctx
1800            .poison_manager
1801            .get_poison(&poison_key.to_string())
1802            .await
1803            .unwrap();
1804        // poison key should be deleted.
1805        assert!(procedure_id.is_none());
1806    }
1807
1808    #[tokio::test]
1809    async fn test_execute_failed_after_set_poison() {
1810        let mut times = 0;
1811        let poison_key = PoisonKey::new("table/1024");
1812        let moved_poison_key = poison_key.clone();
1813        let exec_fn = move |ctx: Context| {
1814            times += 1;
1815            let poison_key = moved_poison_key.clone();
1816            async move {
1817                if times == 1 {
1818                    Ok(Status::executing(true))
1819                } else {
1820                    // Put the poison to the context.
1821                    ctx.provider
1822                        .try_put_poison(&poison_key, ctx.procedure_id)
1823                        .await
1824                        .unwrap();
1825                    Err(Error::external(MockError::new(StatusCode::Unexpected)))
1826                }
1827            }
1828            .boxed()
1829        };
1830        let poison = ProcedureAdapter {
1831            data: "poison".to_string(),
1832            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1833            poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1834            exec_fn,
1835            rollback_fn: None,
1836        };
1837
1838        let dir = create_temp_dir("poison");
1839        let meta = poison.new_meta(ROOT_ID);
1840
1841        let object_store = test_util::new_object_store(&dir);
1842        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1843        let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store.clone());
1844
1845        // Use the manager ctx as the context provider.
1846        let ctx = context_with_provider(
1847            meta.id,
1848            runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1849        );
1850        // Manually add this procedure to the manager ctx.
1851        runner
1852            .manager_ctx
1853            .procedures
1854            .write()
1855            .unwrap()
1856            .insert(meta.id, runner.meta.clone());
1857
1858        runner.manager_ctx.start();
1859        runner.execute_once(&ctx).await;
1860        let state = runner.meta.state();
1861        assert!(state.is_running(), "{state:?}");
1862
1863        runner.execute_once(&ctx).await;
1864        let state = runner.meta.state();
1865        assert!(state.is_failed(), "{state:?}");
1866        assert!(meta.state().is_failed());
1867
1868        // Check the poison is set.
1869        let procedure_id = runner
1870            .manager_ctx
1871            .poison_manager
1872            .get_poison(&poison_key.to_string())
1873            .await
1874            .unwrap()
1875            .unwrap();
1876
1877        // If the procedure is poisoned, the poison key shouldn't be deleted.
1878        assert_eq!(&procedure_id.clone(), ROOT_ID);
1879    }
1880
1881    #[tokio::test]
1882    async fn test_execute_exceed_max_retry_after_set_poison() {
1883        common_telemetry::init_default_ut_logging();
1884        let mut times = 0;
1885        let poison_key = PoisonKey::new("table/1024");
1886        let moved_poison_key = poison_key.clone();
1887        let exec_fn = move |ctx: Context| {
1888            times += 1;
1889            let poison_key = moved_poison_key.clone();
1890            async move {
1891                if times == 1 {
1892                    Ok(Status::executing(true))
1893                } else {
1894                    // Put the poison to the context.
1895                    ctx.provider
1896                        .try_put_poison(&poison_key, ctx.procedure_id)
1897                        .await
1898                        .unwrap();
1899                    Err(Error::retry_later_and_clean_poisons(MockError::new(
1900                        StatusCode::Unexpected,
1901                    )))
1902                }
1903            }
1904            .boxed()
1905        };
1906        let poison = ProcedureAdapter {
1907            data: "poison".to_string(),
1908            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1909            poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1910            exec_fn,
1911            rollback_fn: None,
1912        };
1913
1914        let dir = create_temp_dir("exceed_max_after_set_poison");
1915        let meta = poison.new_meta(ROOT_ID);
1916        let object_store = test_util::new_object_store(&dir);
1917        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1918        let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store);
1919        runner.manager_ctx.start();
1920        runner.exponential_builder = ExponentialBuilder::default()
1921            .with_min_delay(Duration::from_millis(1))
1922            .with_max_times(3);
1923        // Use the manager ctx as the context provider.
1924        let ctx = context_with_provider(
1925            meta.id,
1926            runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1927        );
1928        // Manually add this procedure to the manager ctx.
1929        runner
1930            .manager_ctx
1931            .procedures
1932            .write()
1933            .unwrap()
1934            .insert(meta.id, runner.meta.clone());
1935        // Run the runner and execute the procedure.
1936        runner.execute_once_with_retry(&ctx).await;
1937        let err = meta.state().error().unwrap().clone();
1938        assert_matches!(&*err, Error::RetryTimesExceeded { .. });
1939
1940        // Check the poison is deleted.
1941        let procedure_id = runner
1942            .manager_ctx
1943            .poison_manager
1944            .get_poison(&poison_key.to_string())
1945            .await
1946            .unwrap();
1947        assert_eq!(procedure_id, None);
1948    }
1949
1950    #[tokio::test]
1951    async fn test_execute_poisoned() {
1952        let mut times = 0;
1953        let poison_key = PoisonKey::new("table/1024");
1954        let moved_poison_key = poison_key.clone();
1955        let exec_fn = move |ctx: Context| {
1956            times += 1;
1957            let poison_key = moved_poison_key.clone();
1958            async move {
1959                if times == 1 {
1960                    Ok(Status::executing(true))
1961                } else {
1962                    // Put the poison to the context.
1963                    ctx.provider
1964                        .try_put_poison(&poison_key, ctx.procedure_id)
1965                        .await
1966                        .unwrap();
1967                    Ok(Status::Poisoned {
1968                        keys: PoisonKeys::new(vec![poison_key.clone()]),
1969                        error: Error::external(MockError::new(StatusCode::Unexpected)),
1970                    })
1971                }
1972            }
1973            .boxed()
1974        };
1975        let poison = ProcedureAdapter {
1976            data: "poison".to_string(),
1977            lock_key: LockKey::single_exclusive("catalog.schema.table"),
1978            poison_keys: PoisonKeys::new(vec![poison_key.clone()]),
1979            exec_fn,
1980            rollback_fn: None,
1981        };
1982
1983        let dir = create_temp_dir("poison");
1984        let meta = poison.new_meta(ROOT_ID);
1985
1986        let object_store = test_util::new_object_store(&dir);
1987        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
1988        let mut runner = new_runner(meta.clone(), Box::new(poison), procedure_store.clone());
1989
1990        // Use the manager ctx as the context provider.
1991        let ctx = context_with_provider(
1992            meta.id,
1993            runner.manager_ctx.clone() as Arc<dyn ContextProvider>,
1994        );
1995        // Manually add this procedure to the manager ctx.
1996        runner
1997            .manager_ctx
1998            .procedures
1999            .write()
2000            .unwrap()
2001            .insert(meta.id, runner.meta.clone());
2002
2003        runner.manager_ctx.start();
2004        runner.execute_once(&ctx).await;
2005        let state = runner.meta.state();
2006        assert!(state.is_running(), "{state:?}");
2007
2008        runner.execute_once(&ctx).await;
2009        let state = runner.meta.state();
2010        assert!(state.is_poisoned(), "{state:?}");
2011        assert!(meta.state().is_poisoned());
2012        check_files(
2013            &object_store,
2014            &procedure_store,
2015            ctx.procedure_id,
2016            &["0000000000.step"],
2017        )
2018        .await;
2019
2020        // Check the poison is set.
2021        let procedure_id = runner
2022            .manager_ctx
2023            .poison_manager
2024            .get_poison(&poison_key.to_string())
2025            .await
2026            .unwrap()
2027            .unwrap();
2028
2029        // If the procedure is poisoned, the poison key shouldn't be deleted.
2030        assert_eq!(procedure_id, ROOT_ID);
2031    }
2032
2033    fn test_procedure_with_dynamic_lock(
2034        shared_atomic_value: Arc<AtomicU64>,
2035        id: u64,
2036    ) -> (BoxedProcedure, Arc<ProcedureMeta>) {
2037        let exec_fn = move |ctx: Context| {
2038            let moved_shared_atomic_value = shared_atomic_value.clone();
2039            let moved_ctx = ctx.clone();
2040            async move {
2041                debug!("Acquiring write lock, id: {}", id);
2042                let key = StringKey::Exclusive("test_lock".to_string());
2043                let guard = moved_ctx.provider.acquire_lock(&key).await;
2044                debug!("Acquired write lock, id: {}", id);
2045                let millis = rand::rng().random_range(10..=50);
2046                tokio::time::sleep(Duration::from_millis(millis)).await;
2047                let value = moved_shared_atomic_value.load(Ordering::Relaxed);
2048                moved_shared_atomic_value.store(value + 1, Ordering::Relaxed);
2049                debug!("Dropping write lock, id: {}", id);
2050                drop(guard);
2051
2052                Ok(Status::done())
2053            }
2054            .boxed()
2055        };
2056
2057        let adapter = ProcedureAdapter {
2058            data: "dynamic_lock".to_string(),
2059            lock_key: LockKey::new_exclusive([]),
2060            poison_keys: PoisonKeys::new([]),
2061            exec_fn,
2062            rollback_fn: None,
2063        };
2064        let meta = adapter.new_meta(ROOT_ID);
2065
2066        (Box::new(adapter), meta)
2067    }
2068
2069    #[tokio::test(flavor = "multi_thread")]
2070    async fn test_execute_with_dynamic_lock() {
2071        common_telemetry::init_default_ut_logging();
2072        let shared_atomic_value = Arc::new(AtomicU64::new(0));
2073        let (procedure1, meta1) = test_procedure_with_dynamic_lock(shared_atomic_value.clone(), 1);
2074        let (procedure2, meta2) = test_procedure_with_dynamic_lock(shared_atomic_value.clone(), 2);
2075
2076        let dir = create_temp_dir("dynamic_lock");
2077        let object_store = test_util::new_object_store(&dir);
2078        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
2079        let mut runner1 = new_runner(meta1.clone(), procedure1, procedure_store.clone());
2080        let mut runner2 = new_runner(meta2.clone(), procedure2, procedure_store.clone());
2081        let ctx1 = context_with_provider(
2082            meta1.id,
2083            runner1.manager_ctx.clone() as Arc<dyn ContextProvider>,
2084        );
2085        let ctx2 = context_with_provider(
2086            meta2.id,
2087            // use same manager ctx as runner1
2088            runner1.manager_ctx.clone() as Arc<dyn ContextProvider>,
2089        );
2090        let tasks = [runner1.execute_once(&ctx1), runner2.execute_once(&ctx2)];
2091        join_all(tasks).await;
2092        assert_eq!(shared_atomic_value.load(Ordering::Relaxed), 2);
2093    }
2094    #[tokio::test]
2095    async fn test_on_suspend_deadlock_detected_no_rollback() {
2096        // Parent holds Exclusive("catalog.schema.table"), child also requests Exclusive("catalog.schema.table").
2097        // Since parent does NOT support rollback, state should become Failed.
2098        let child_id = ProcedureId::random();
2099        let exec_fn = move |_| {
2100            async move {
2101                let child_exec_fn = |_| async { Ok(Status::done()) }.boxed();
2102                let child = ProcedureAdapter {
2103                    data: "child".to_string(),
2104                    lock_key: LockKey::single_exclusive("catalog.schema.table"),
2105                    poison_keys: PoisonKeys::default(),
2106                    exec_fn: child_exec_fn,
2107                    rollback_fn: None,
2108                };
2109                Ok(Status::Suspended {
2110                    subprocedures: vec![ProcedureWithId {
2111                        id: child_id,
2112                        procedure: Box::new(child),
2113                    }],
2114                    persist: false,
2115                })
2116            }
2117            .boxed()
2118        };
2119        let parent = ProcedureAdapter {
2120            data: "parent".to_string(),
2121            lock_key: LockKey::single_exclusive("catalog.schema.table"),
2122            poison_keys: PoisonKeys::default(),
2123            exec_fn,
2124            rollback_fn: None, // No rollback support
2125        };
2126
2127        let dir = create_temp_dir("deadlock_no_rollback");
2128        let meta = parent.new_meta(ROOT_ID);
2129        let ctx = context_without_provider(meta.id);
2130        let object_store = test_util::new_object_store(&dir);
2131        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
2132        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
2133        runner.manager_ctx.start();
2134
2135        runner.execute_once(&ctx).await;
2136        let state = runner.meta.state();
2137        assert!(state.is_failed(), "Expected Failed, got {state:?}");
2138        // Verify the error exists
2139        assert!(
2140            state.error().is_some(),
2141            "Failed state should contain an error"
2142        );
2143        // Child should NOT have been submitted
2144        assert!(
2145            !runner.manager_ctx.contains_procedure(child_id),
2146            "Child procedure should not be submitted when deadlock is detected"
2147        );
2148    }
2149
2150    #[tokio::test]
2151    async fn test_on_suspend_deadlock_detected_with_rollback() {
2152        // Parent holds Exclusive("catalog.schema.table"), child also requests Exclusive("catalog.schema.table").
2153        // Since parent DOES support rollback, state should become PrepareRollback.
2154        let child_id = ProcedureId::random();
2155        let exec_fn = move |_| {
2156            async move {
2157                let child_exec_fn = |_| async { Ok(Status::done()) }.boxed();
2158                let child = ProcedureAdapter {
2159                    data: "child".to_string(),
2160                    lock_key: LockKey::single_exclusive("catalog.schema.table"),
2161                    poison_keys: PoisonKeys::default(),
2162                    exec_fn: child_exec_fn,
2163                    rollback_fn: None,
2164                };
2165                Ok(Status::Suspended {
2166                    subprocedures: vec![ProcedureWithId {
2167                        id: child_id,
2168                        procedure: Box::new(child),
2169                    }],
2170                    persist: false,
2171                })
2172            }
2173            .boxed()
2174        };
2175        let rollback_fn = move |_| async move { Ok(()) }.boxed();
2176        let parent = ProcedureAdapter {
2177            data: "parent".to_string(),
2178            lock_key: LockKey::single_exclusive("catalog.schema.table"),
2179            poison_keys: PoisonKeys::default(),
2180            exec_fn,
2181            rollback_fn: Some(Box::new(rollback_fn)), // Supports rollback
2182        };
2183
2184        let dir = create_temp_dir("deadlock_with_rollback");
2185        let meta = parent.new_meta(ROOT_ID);
2186        let ctx = context_without_provider(meta.id);
2187        let object_store = test_util::new_object_store(&dir);
2188        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
2189        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
2190        runner.manager_ctx.start();
2191
2192        runner.execute_once(&ctx).await;
2193        let state = runner.meta.state();
2194        assert!(
2195            state.is_prepare_rollback(),
2196            "Expected PrepareRollback, got {state:?}"
2197        );
2198        // Verify the error exists in PrepareRollback variant
2199        match &state {
2200            ProcedureState::PrepareRollback { error } => {
2201                assert!(!error.to_string().is_empty(), "Error should not be empty");
2202            }
2203            _ => panic!("Expected PrepareRollback, got {state:?}"),
2204        }
2205        // Child should NOT have been submitted
2206        assert!(
2207            !runner.manager_ctx.contains_procedure(child_id),
2208            "Child procedure should not be submitted when deadlock is detected"
2209        );
2210    }
2211
2212    #[test]
2213    fn test_find_lock_conflicts() {
2214        use crate::procedure::StringKey;
2215
2216        // 1. Share + Share = No conflict (Compatible)
2217        let parent = [StringKey::Share("A".to_string())];
2218        let child = [StringKey::Share("A".to_string())];
2219        assert!(super::find_lock_conflicts(parent.iter(), child.iter()).is_empty());
2220
2221        // 2. Share + Exclusive = Conflict
2222        let parent = [StringKey::Share("A".to_string())];
2223        let child = [StringKey::Exclusive("A".to_string())];
2224        assert_eq!(
2225            super::find_lock_conflicts(parent.iter(), child.iter()),
2226            vec!["A".to_string()]
2227        );
2228
2229        // 3. Exclusive + Share = Conflict
2230        let parent = [StringKey::Exclusive("A".to_string())];
2231        let child = [StringKey::Share("A".to_string())];
2232        assert_eq!(
2233            super::find_lock_conflicts(parent.iter(), child.iter()),
2234            vec!["A".to_string()]
2235        );
2236
2237        // 4. Exclusive + Exclusive = Conflict
2238        let parent = [StringKey::Exclusive("A".to_string())];
2239        let child = [StringKey::Exclusive("A".to_string())];
2240        assert_eq!(
2241            super::find_lock_conflicts(parent.iter(), child.iter()),
2242            vec!["A".to_string()]
2243        );
2244
2245        // 5. Multiple keys, partial overlap
2246        let parent = [
2247            StringKey::Share("A".to_string()),
2248            StringKey::Exclusive("B".to_string()),
2249        ];
2250        let child = [
2251            StringKey::Exclusive("A".to_string()), // Conflict with Share("A")
2252            StringKey::Share("B".to_string()),     // Conflict with Exclusive("B")
2253            StringKey::Exclusive("C".to_string()), // No conflict, parent doesn't hold C
2254        ];
2255        let mut conflicts = super::find_lock_conflicts(parent.iter(), child.iter());
2256        conflicts.sort();
2257        assert_eq!(conflicts, vec!["A".to_string(), "B".to_string()]);
2258    }
2259}