Skip to main content

common_procedure/
local.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15mod runner;
16
17use std::collections::hash_map::Entry;
18use std::collections::{HashMap, HashSet, VecDeque};
19use std::sync::atomic::{AtomicBool, AtomicI64, Ordering};
20use std::sync::{Arc, Mutex, RwLock};
21use std::time::{Duration, Instant};
22
23use async_trait::async_trait;
24use backon::ExponentialBuilder;
25use common_error::ext::BoxedError;
26use common_event_recorder::EventRecorderRef;
27use common_runtime::{JoinHandle, RepeatedTask, TaskFunction};
28use common_telemetry::tracing_context::{FutureExt, TracingContext};
29use common_telemetry::{error, info, tracing};
30use snafu::{OptionExt, ResultExt, ensure};
31use tokio::sync::watch::{self, Receiver, Sender};
32use tokio::sync::{Mutex as TokioMutex, Notify};
33
34use crate::error::{
35    self, CheckStatusSnafu, DuplicateProcedureSnafu, Error, LoaderConflictSnafu,
36    ManagerNotStartSnafu, ManagerPasuedSnafu, PoisonKeyNotDefinedSnafu, ProcedureNotFoundSnafu,
37    Result, StartRemoveOutdatedMetaTaskSnafu, StopRemoveOutdatedMetaTaskSnafu,
38    TooManyRunningProceduresSnafu,
39};
40use crate::event::ProcedureEvent;
41use crate::local::runner::Runner;
42use crate::procedure::{BoxedProcedureLoader, InitProcedureState, PoisonKeys, ProcedureInfo};
43use crate::rwlock::{KeyRwLock, OwnedKeyRwLockGuard};
44use crate::store::poison_store::PoisonStoreRef;
45use crate::store::{ProcedureMessage, ProcedureMessages, ProcedureStore, StateStoreRef};
46use crate::{
47    BoxedProcedure, ContextProvider, LockKey, PoisonKey, ProcedureId, ProcedureManager,
48    ProcedureState, ProcedureWithId, StringKey, UserMetadata, Watcher,
49};
50
/// Default time-to-live of a procedure's metadata (10 minutes).
///
/// Used as the default value of `ManagerConfig::remove_outdated_meta_ttl`.
const META_TTL: Duration = Duration::from_secs(10 * 60);
53
/// Shared metadata of a procedure.
///
/// # Note
/// [Notify] is not a condition variable: a waiter that has not called
/// `notified()` before the notify is signalled is not guaranteed to be woken up.
/// Therefore we
/// 1. use a dedicated notify for each condition, such as waiting for a lock or
///    waiting for children;
/// 2. always use `notify_one` and ensure there is only one waiter.
#[derive(Debug)]
pub(crate) struct ProcedureMeta {
    /// Id of this procedure.
    id: ProcedureId,
    /// Type name of this procedure.
    type_name: String,
    /// Id of the parent procedure, `None` for a root procedure.
    parent_id: Option<ProcedureId>,
    /// Notify used to wake this procedure up while it waits for its subprocedures.
    child_notify: Notify,
    /// Lock required by this procedure.
    lock_key: LockKey,
    /// Poison keys that may cause this procedure to become poisoned during execution.
    poison_keys: PoisonKeys,
    /// Sender used to publish the procedure state.
    state_sender: Sender<ProcedureState>,
    /// Receiver used to watch the procedure state; it is cloned to hand out watchers.
    state_receiver: Receiver<ProcedureState>,
    /// Ids of the child procedures.
    children: Mutex<Vec<ProcedureId>>,
    /// Start execution time of this procedure in milliseconds (initialized to 0).
    start_time_ms: AtomicI64,
    /// End execution time of this procedure in milliseconds (initialized to 0).
    end_time_ms: AtomicI64,
    /// Event recorder that receives an event on state changes, if configured.
    event_recorder: Option<EventRecorderRef>,
    /// The user metadata of the procedure. It's generated by [Procedure::user_metadata].
    user_metadata: Option<UserMetadata>,
}
91
92impl ProcedureMeta {
93    #[allow(clippy::too_many_arguments)]
94    fn new(
95        id: ProcedureId,
96        procedure_state: ProcedureState,
97        parent_id: Option<ProcedureId>,
98        lock_key: LockKey,
99        poison_keys: PoisonKeys,
100        type_name: &str,
101        event_recorder: Option<EventRecorderRef>,
102        user_metadata: Option<UserMetadata>,
103    ) -> ProcedureMeta {
104        let (state_sender, state_receiver) = watch::channel(procedure_state);
105        ProcedureMeta {
106            id,
107            parent_id,
108            child_notify: Notify::new(),
109            lock_key,
110            poison_keys,
111            state_sender,
112            state_receiver,
113            children: Mutex::new(Vec::new()),
114            start_time_ms: AtomicI64::new(0),
115            end_time_ms: AtomicI64::new(0),
116            type_name: type_name.to_string(),
117            event_recorder,
118            user_metadata,
119        }
120    }
121
122    /// Returns current [ProcedureState].
123    fn state(&self) -> ProcedureState {
124        self.state_receiver.borrow().clone()
125    }
126
127    /// Update current [ProcedureState].
128    fn set_state(&self, state: ProcedureState) {
129        // Emit the event to the event recorder if the user metadata contains the eventable object.
130        if let (Some(event_recorder), Some(user_metadata)) =
131            (&self.event_recorder, &self.user_metadata)
132            && let Some(event) = user_metadata.to_event()
133        {
134            event_recorder.record(Box::new(ProcedureEvent::new(self.id, event, state.clone())));
135        }
136
137        // Safety: ProcedureMeta also holds the receiver, so `send()` should never fail.
138        self.state_sender.send(state).unwrap();
139    }
140
141    /// Push `procedure_id` of the subprocedure to the metadata.
142    fn push_child(&self, procedure_id: ProcedureId) {
143        let mut children = self.children.lock().unwrap();
144        children.push(procedure_id);
145    }
146
147    /// Append subprocedures to given `buffer`.
148    fn list_children(&self, buffer: &mut Vec<ProcedureId>) {
149        let children = self.children.lock().unwrap();
150        buffer.extend_from_slice(&children);
151    }
152
153    /// Returns the number of subprocedures.
154    fn num_children(&self) -> usize {
155        self.children.lock().unwrap().len()
156    }
157
158    /// update the start time of the procedure.
159    fn set_start_time_ms(&self) {
160        self.start_time_ms
161            .store(common_time::util::current_time_millis(), Ordering::Relaxed);
162    }
163
164    /// update the end time of the procedure.
165    fn set_end_time_ms(&self) {
166        self.end_time_ms
167            .store(common_time::util::current_time_millis(), Ordering::Relaxed);
168    }
169}
170
/// Reference-counted, shareable pointer to a [ProcedureMeta].
type ProcedureMetaRef = Arc<ProcedureMeta>;
173
/// Procedure loaded from store.
struct LoadedProcedure {
    /// The procedure instance rebuilt by the registered loader.
    procedure: BoxedProcedure,
    /// The step value copied from the stored procedure message.
    step: u32,
}
179
/// The dynamic lock for procedure execution.
///
/// Unlike the procedure-level locks, these locks are acquired dynamically by the
/// procedure during execution. They are only held while the procedure specifically
/// needs these keys and are released as soon as it no longer needs them, which
/// allows more fine-grained concurrency control during procedure execution.
pub(crate) type DynamicKeyLock = Arc<KeyRwLock<String>>;
187
188/// Acquires a dynamic key lock for the given key.
189///
190/// This function takes a reference to the dynamic key lock and a pointer to the key.
191/// It then matches the key type and acquires the appropriate lock.
192pub async fn acquire_dynamic_key_lock(
193    lock: &DynamicKeyLock,
194    key: &StringKey,
195) -> DynamicKeyLockGuard {
196    match key {
197        StringKey::Share(key) => {
198            let guard = lock.read(key.clone()).await;
199            DynamicKeyLockGuard {
200                guard: Some(OwnedKeyRwLockGuard::from(guard)),
201                key: key.clone(),
202                lock: lock.clone(),
203            }
204        }
205        StringKey::Exclusive(key) => {
206            let guard = lock.write(key.clone()).await;
207            DynamicKeyLockGuard {
208                guard: Some(OwnedKeyRwLockGuard::from(guard)),
209                key: key.clone(),
210                lock: lock.clone(),
211            }
212        }
213    }
214}

/// A guard for the dynamic key lock.
///
/// The lock stays held for as long as this guard is alive. Dropping the guard
/// releases the lock and then asks the lock to clean up the entry of the key.
pub struct DynamicKeyLockGuard {
    /// The held lock guard; taken out and dropped first when this guard is dropped.
    guard: Option<OwnedKeyRwLockGuard>,
    /// The key this guard was acquired for.
    key: String,
    /// The lock the key belongs to, kept so the key can be cleaned on drop.
    lock: DynamicKeyLock,
}
224
225impl Drop for DynamicKeyLockGuard {
226    fn drop(&mut self) {
227        if let Some(guard) = self.guard.take() {
228            drop(guard);
229        }
230        self.lock.clean_keys(std::slice::from_ref(&self.key));
231    }
232}
233
/// Shared context of the manager.
pub(crate) struct ManagerContext {
    /// Procedure loaders. The key is the type name of the procedure which the loader returns.
    loaders: Mutex<HashMap<String, BoxedProcedureLoader>>,
    /// The key lock for the procedure.
    ///
    /// The lock keys are defined in `Procedure::lock_key()`.
    /// These locks are acquired before the procedure starts and released after the procedure finishes.
    /// They ensure exclusive access to resources throughout the entire procedure lifecycle.
    key_lock: KeyRwLock<String>,
    /// The dynamic lock for procedure execution.
    ///
    /// Unlike the procedure-level locks, these locks are acquired dynamically by the procedure
    /// during execution. They are only held when the procedure specifically needs these keys
    /// and are released as soon as the procedure no longer needs them.
    /// This allows for more fine-grained concurrency control during procedure execution.
    dynamic_key_lock: DynamicKeyLock,
    /// Metadata of every procedure known to the manager, keyed by procedure id.
    procedures: RwLock<HashMap<ProcedureId, ProcedureMetaRef>>,
    /// Ids of the procedures that are currently running.
    running_procedures: Mutex<HashSet<ProcedureId>>,
    /// Ids and finished time of finished procedures, appended in finish order.
    finished_procedures: Mutex<VecDeque<(ProcedureId, Instant)>>,
    /// Join handles of the spawned runner tasks, keyed by procedure id.
    runner_tasks: Mutex<HashMap<ProcedureId, JoinHandle<()>>>,
    /// Running flag: set by `start()` and cleared by `stop()`.
    running: Arc<AtomicBool>,
    /// Poison manager.
    poison_manager: PoisonStoreRef,
}
264
265#[async_trait]
266impl ContextProvider for ManagerContext {
267    async fn procedure_state(&self, procedure_id: ProcedureId) -> Result<Option<ProcedureState>> {
268        Ok(self.state(procedure_id))
269    }
270
271    async fn procedure_state_receiver(
272        &self,
273        procedure_id: ProcedureId,
274    ) -> Result<Option<Receiver<ProcedureState>>> {
275        Ok(self.state_receiver(procedure_id))
276    }
277
278    async fn try_put_poison(&self, key: &PoisonKey, procedure_id: ProcedureId) -> Result<()> {
279        {
280            // validate the procedure exists
281            let procedures = self.procedures.read().unwrap();
282            let procedure = procedures
283                .get(&procedure_id)
284                .context(ProcedureNotFoundSnafu { procedure_id })?;
285
286            // validate the poison key is defined
287            ensure!(
288                procedure.poison_keys.contains(key),
289                PoisonKeyNotDefinedSnafu {
290                    key: key.clone(),
291                    procedure_id
292                }
293            );
294        }
295        let key = key.to_string();
296        let procedure_id = procedure_id.to_string();
297        self.poison_manager.try_put_poison(key, procedure_id).await
298    }
299
300    async fn acquire_lock(&self, key: &StringKey) -> DynamicKeyLockGuard {
301        acquire_dynamic_key_lock(&self.dynamic_key_lock, key).await
302    }
303}
304
305impl ManagerContext {
306    /// Returns a new [ManagerContext].
307    fn new(poison_manager: PoisonStoreRef) -> ManagerContext {
308        ManagerContext {
309            key_lock: KeyRwLock::new(),
310            dynamic_key_lock: Arc::new(KeyRwLock::new()),
311            loaders: Mutex::new(HashMap::new()),
312            procedures: RwLock::new(HashMap::new()),
313            running_procedures: Mutex::new(HashSet::new()),
314            finished_procedures: Mutex::new(VecDeque::new()),
315            runner_tasks: Mutex::new(HashMap::new()),
316            running: Arc::new(AtomicBool::new(false)),
317            poison_manager,
318        }
319    }
320
321    #[cfg(test)]
322    pub(crate) fn set_running(&self) {
323        self.running.store(true, Ordering::Relaxed);
324    }
325
326    /// Set the running flag.
327    pub(crate) fn start(&self) {
328        self.running.store(true, Ordering::Relaxed);
329    }
330
331    pub(crate) fn stop(&self) {
332        self.running.store(false, Ordering::Relaxed);
333    }
334
335    fn reset_runtime_state(&self) {
336        self.procedures.write().unwrap().clear();
337        self.running_procedures.lock().unwrap().clear();
338        self.finished_procedures.lock().unwrap().clear();
339        for handle in self
340            .runner_tasks
341            .lock()
342            .unwrap()
343            .drain()
344            .map(|(_, handle)| handle)
345        {
346            handle.abort();
347        }
348        self.key_lock.clear();
349        self.dynamic_key_lock.clear();
350    }
351
352    fn spawn_runner_task<F>(&self, procedure_id: ProcedureId, spawn: F) -> bool
353    where
354        F: FnOnce() -> JoinHandle<()>,
355    {
356        let mut tasks = self.runner_tasks.lock().unwrap();
357        if !self.running() {
358            return false;
359        }
360
361        let handle = spawn();
362        let _ = tasks.insert(procedure_id, handle);
363        true
364    }
365
366    fn remove_procedure(&self, procedure_id: ProcedureId) {
367        self.procedures.write().unwrap().remove(&procedure_id);
368        self.running_procedures
369            .lock()
370            .unwrap()
371            .remove(&procedure_id);
372    }
373
374    pub(crate) fn remove_runner_task(&self, procedure_id: ProcedureId) {
375        let _ = self.runner_tasks.lock().unwrap().remove(&procedure_id);
376    }
377
378    fn take_runner_tasks(&self) -> Vec<JoinHandle<()>> {
379        self.runner_tasks
380            .lock()
381            .unwrap()
382            .drain()
383            .map(|(_, handle)| handle)
384            .collect()
385    }
386
387    async fn abort_runner_tasks(&self) {
388        let handles = self.take_runner_tasks();
389
390        for handle in &handles {
391            handle.abort();
392        }
393
394        for handle in handles {
395            if let Err(e) = handle.await
396                && !e.is_cancelled()
397            {
398                error!(
399                    e; "Procedure runner task exits unexpectedly during stop",
400                );
401            }
402        }
403    }
404
405    /// Return `ProcedureManager` is running.
406    pub(crate) fn running(&self) -> bool {
407        self.running.load(Ordering::Relaxed)
408    }
409
410    /// Returns true if the procedure with specific `procedure_id` exists.
411    fn contains_procedure(&self, procedure_id: ProcedureId) -> bool {
412        let procedures = self.procedures.read().unwrap();
413        procedures.contains_key(&procedure_id)
414    }
415
416    /// Returns the number of running procedures.
417    fn num_running_procedures(&self) -> usize {
418        self.running_procedures.lock().unwrap().len()
419    }
420
421    /// Try to insert the `procedure` to the context if there is no procedure
422    /// with same [ProcedureId].
423    ///
424    /// Returns `false` if there is already a procedure using the same [ProcedureId].
425    fn try_insert_procedure(&self, meta: ProcedureMetaRef) -> bool {
426        let procedure_id = meta.id;
427        let mut procedures = self.procedures.write().unwrap();
428        match procedures.entry(procedure_id) {
429            Entry::Occupied(_) => return false,
430            Entry::Vacant(vacant_entry) => {
431                vacant_entry.insert(meta);
432            }
433        }
434
435        let mut running_procedures = self.running_procedures.lock().unwrap();
436        running_procedures.insert(procedure_id);
437
438        true
439    }
440
441    /// Returns the [ProcedureState] of specific `procedure_id`.
442    fn state(&self, procedure_id: ProcedureId) -> Option<ProcedureState> {
443        let procedures = self.procedures.read().unwrap();
444        procedures.get(&procedure_id).map(|meta| meta.state())
445    }
446
447    /// Returns the [Receiver<ProcedureState>] of specific `procedure_id`.
448    fn state_receiver(&self, procedure_id: ProcedureId) -> Option<Receiver<ProcedureState>> {
449        let procedures = self.procedures.read().unwrap();
450        procedures
451            .get(&procedure_id)
452            .map(|meta| meta.state_receiver.clone())
453    }
454
455    /// Returns the [ProcedureMeta] of all procedures.
456    fn list_procedure(&self) -> Vec<ProcedureInfo> {
457        let procedures = self.procedures.read().unwrap();
458        procedures
459            .values()
460            .map(|meta| ProcedureInfo {
461                id: meta.id,
462                type_name: meta.type_name.clone(),
463                start_time_ms: meta.start_time_ms.load(Ordering::Relaxed),
464                end_time_ms: meta.end_time_ms.load(Ordering::Relaxed),
465                state: meta.state(),
466                lock_keys: meta.lock_key.get_keys(),
467            })
468            .collect()
469    }
470
471    /// Returns the [Watcher] of specific `procedure_id`.
472    fn watcher(&self, procedure_id: ProcedureId) -> Option<Watcher> {
473        let procedures = self.procedures.read().unwrap();
474        procedures
475            .get(&procedure_id)
476            .map(|meta| meta.state_receiver.clone())
477    }
478
479    /// Notify a suspended parent procedure with specific `procedure_id` by its subprocedure.
480    fn notify_by_subprocedure(&self, procedure_id: ProcedureId) {
481        let procedures = self.procedures.read().unwrap();
482        if let Some(meta) = procedures.get(&procedure_id) {
483            meta.child_notify.notify_one();
484        }
485    }
486
487    /// Load procedure from specific [ProcedureMessage].
488    fn load_one_procedure_from_message(
489        &self,
490        procedure_id: ProcedureId,
491        message: &ProcedureMessage,
492    ) -> Option<LoadedProcedure> {
493        let loaders = self.loaders.lock().unwrap();
494        let loader = loaders.get(&message.type_name).or_else(|| {
495            error!(
496                "Loader not found, procedure_id: {}, type_name: {}",
497                procedure_id, message.type_name
498            );
499            None
500        })?;
501
502        let procedure = loader(&message.data)
503            .map_err(|e| {
504                error!(
505                    "Failed to load procedure data, key: {}, source: {:?}",
506                    procedure_id, e
507                );
508                e
509            })
510            .ok()?;
511
512        Some(LoadedProcedure {
513            procedure,
514            step: message.step,
515        })
516    }
517
518    /// Returns all procedures in the tree (including given `root` procedure).
519    ///
520    /// If callers need a consistent view of the tree, they must ensure no new
521    /// procedure is added to the tree during using this method.
522    fn procedures_in_tree(&self, root: &ProcedureMetaRef) -> Vec<ProcedureId> {
523        let sub_num = root.num_children();
524        // Reserve capacity for the root procedure and its children.
525        let mut procedures = Vec::with_capacity(1 + sub_num);
526
527        let mut queue = VecDeque::with_capacity(1 + sub_num);
528        // Push the root procedure to the queue.
529        queue.push_back(root.clone());
530
531        let mut children_ids = Vec::with_capacity(sub_num);
532        let mut children = Vec::with_capacity(sub_num);
533        while let Some(meta) = queue.pop_front() {
534            procedures.push(meta.id);
535
536            // Find metadatas of children.
537            children_ids.clear();
538            meta.list_children(&mut children_ids);
539            self.find_procedures(&children_ids, &mut children);
540
541            // Traverse children later.
542            for child in children.drain(..) {
543                queue.push_back(child);
544            }
545        }
546
547        procedures
548    }
549
550    /// Finds procedures by given `procedure_ids`.
551    ///
552    /// Ignores the id if corresponding procedure is not found.
553    fn find_procedures(&self, procedure_ids: &[ProcedureId], metas: &mut Vec<ProcedureMetaRef>) {
554        let procedures = self.procedures.read().unwrap();
555        for procedure_id in procedure_ids {
556            if let Some(meta) = procedures.get(procedure_id) {
557                metas.push(meta.clone());
558            }
559        }
560    }
561
562    /// Clean resources of finished procedures.
563    fn on_procedures_finish(&self, procedure_ids: &[ProcedureId]) {
564        // Since users need to query the procedure state, so we can't remove the
565        // meta of the procedure directly.
566        let now = Instant::now();
567        let mut finished_procedures = self.finished_procedures.lock().unwrap();
568        finished_procedures.extend(procedure_ids.iter().map(|id| (*id, now)));
569
570        // Remove the procedures from the running set.
571        let mut running_procedures = self.running_procedures.lock().unwrap();
572        for procedure_id in procedure_ids {
573            running_procedures.remove(procedure_id);
574        }
575    }
576
577    /// Remove metadata of outdated procedures.
578    fn remove_outdated_meta(&self, ttl: Duration) {
579        let ids = {
580            let mut finished_procedures = self.finished_procedures.lock().unwrap();
581            if finished_procedures.is_empty() {
582                return;
583            }
584
585            let mut ids_to_remove = Vec::new();
586            while let Some((id, finish_time)) = finished_procedures.front() {
587                if finish_time.elapsed() > ttl {
588                    ids_to_remove.push(*id);
589                    let _ = finished_procedures.pop_front();
590                } else {
591                    // The rest procedures are finished later, so we can break
592                    // the loop.
593                    break;
594                }
595            }
596            ids_to_remove
597        };
598
599        if ids.is_empty() {
600            return;
601        }
602
603        let mut procedures = self.procedures.write().unwrap();
604        for id in ids {
605            let _ = procedures.remove(&id);
606        }
607    }
608}
609
/// Config for [LocalManager].
#[derive(Debug)]
pub struct ManagerConfig {
    /// Parent path handed to the procedure store.
    pub parent_path: String,
    /// Maximum number of retries configured on the runner's exponential backoff.
    pub max_retry_times: usize,
    /// Minimum delay configured on the runner's exponential backoff.
    pub retry_delay: Duration,
    /// Interval at which the task removing outdated metadata runs.
    pub remove_outdated_meta_task_interval: Duration,
    /// How long the metadata of a finished procedure is kept before that task removes it.
    pub remove_outdated_meta_ttl: Duration,
    /// Upper bound on the number of running procedures; further root submissions are rejected.
    pub max_running_procedures: usize,
}
620
621impl Default for ManagerConfig {
622    fn default() -> Self {
623        Self {
624            parent_path: String::default(),
625            max_retry_times: 3,
626            retry_delay: Duration::from_millis(500),
627            remove_outdated_meta_task_interval: Duration::from_secs(60 * 10),
628            remove_outdated_meta_ttl: META_TTL,
629            max_running_procedures: 128,
630        }
631    }
632}
633
/// Reference-counted pointer to a [PauseAware] implementation.
type PauseAwareRef = Arc<dyn PauseAware>;
635
/// Reports whether the procedure manager is paused.
///
/// [LocalManager] consults it in its status check when one is configured.
#[async_trait]
pub trait PauseAware: Send + Sync {
    /// Returns true if the procedure manager is paused.
    async fn is_paused(&self) -> std::result::Result<bool, BoxedError>;
}
641
/// A [ProcedureManager] that maintains procedure states locally.
pub struct LocalManager {
    /// Context shared with the procedure runners.
    manager_ctx: Arc<ManagerContext>,
    /// Store used to persist and load procedure messages.
    procedure_store: Arc<ProcedureStore>,
    /// Maximum retry times, copied from the config at construction.
    max_retry_times: usize,
    /// Retry delay, copied from the config at construction.
    retry_delay: Duration,
    /// GC task.
    remove_outdated_meta_task: TokioMutex<Option<RepeatedTask<Error>>>,
    /// The config this manager was created with.
    config: ManagerConfig,
    /// Optional pause checker consulted by the status check.
    pause_aware: Option<PauseAwareRef>,
    /// Optional event recorder handed to every submitted root procedure.
    event_recorder: Option<EventRecorderRef>,
}
654
655impl LocalManager {
656    /// Create a new [LocalManager] with specific `config`.
657    pub fn new(
658        config: ManagerConfig,
659        state_store: StateStoreRef,
660        poison_store: PoisonStoreRef,
661        pause_aware: Option<PauseAwareRef>,
662        event_recorder: Option<EventRecorderRef>,
663    ) -> LocalManager {
664        let manager_ctx = Arc::new(ManagerContext::new(poison_store));
665
666        LocalManager {
667            manager_ctx,
668            procedure_store: Arc::new(ProcedureStore::new(&config.parent_path, state_store)),
669            max_retry_times: config.max_retry_times,
670            retry_delay: config.retry_delay,
671            remove_outdated_meta_task: TokioMutex::new(None),
672            config,
673            pause_aware,
674            event_recorder,
675        }
676    }
677
678    /// Build remove outedated meta task
679    pub fn build_remove_outdated_meta_task(&self) -> RepeatedTask<Error> {
680        RepeatedTask::new(
681            self.config.remove_outdated_meta_task_interval,
682            Box::new(RemoveOutdatedMetaFunction {
683                manager_ctx: self.manager_ctx.clone(),
684                ttl: self.config.remove_outdated_meta_ttl,
685            }),
686        )
687    }
688
689    /// Submit a root procedure with given `procedure_id`.
690    fn submit_root(
691        &self,
692        procedure_id: ProcedureId,
693        procedure_state: ProcedureState,
694        step: u32,
695        procedure: BoxedProcedure,
696    ) -> Result<Watcher> {
697        ensure!(self.manager_ctx.running(), ManagerNotStartSnafu);
698
699        let user_metadata = procedure.user_metadata();
700        let meta = Arc::new(ProcedureMeta::new(
701            procedure_id,
702            procedure_state,
703            None,
704            procedure.lock_key(),
705            procedure.poison_keys(),
706            procedure.type_name(),
707            self.event_recorder.clone(),
708            user_metadata.clone(),
709        ));
710        let runner = Runner {
711            meta: meta.clone(),
712            procedure,
713            manager_ctx: self.manager_ctx.clone(),
714            step,
715            exponential_builder: ExponentialBuilder::default()
716                .with_min_delay(self.retry_delay)
717                .with_max_times(self.max_retry_times),
718            store: self.procedure_store.clone(),
719            rolling_back: false,
720            event_recorder: self.event_recorder.clone(),
721        };
722
723        if let (Some(event_recorder), Some(event)) = (
724            self.event_recorder.as_ref(),
725            user_metadata.and_then(|m| m.to_event()),
726        ) {
727            event_recorder.record(Box::new(ProcedureEvent::new(
728                procedure_id,
729                event,
730                ProcedureState::Running,
731            )));
732        }
733
734        let watcher = meta.state_receiver.clone();
735
736        ensure!(
737            self.manager_ctx.num_running_procedures() < self.config.max_running_procedures,
738            TooManyRunningProceduresSnafu {
739                max_running_procedures: self.config.max_running_procedures,
740            }
741        );
742
743        // Inserts meta into the manager before actually spawnd the runner.
744        ensure!(
745            self.manager_ctx.try_insert_procedure(meta),
746            DuplicateProcedureSnafu { procedure_id },
747        );
748
749        let tracing_context = TracingContext::from_current_span();
750
751        ensure!(
752            self.manager_ctx.spawn_runner_task(procedure_id, || {
753                common_runtime::spawn_global(async move {
754                    let span = tracing_context.attach(tracing::info_span!(
755                    "LocalManager::submit_root_procedure",
756                        procedure_name = %runner.meta.type_name,
757                        procedure_id = %runner.meta.id,
758                    ));
759                    // Run the root procedure.
760                    // The task was moved to another runtime for execution.
761                    // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
762                    runner.run().trace(span).await;
763                })
764            }),
765            {
766                self.manager_ctx.remove_procedure(procedure_id);
767                ManagerNotStartSnafu
768            }
769        );
770
771        Ok(watcher)
772    }
773
774    fn submit_recovered_messages(
775        &self,
776        messages: HashMap<ProcedureId, ProcedureMessage>,
777        init_state: InitProcedureState,
778    ) {
779        for (procedure_id, message) in &messages {
780            if message.parent_id.is_none() {
781                // This is the root procedure. We only submit the root procedure as it will
782                // submit sub-procedures to the manager.
783                let Some(mut loaded_procedure) = self
784                    .manager_ctx
785                    .load_one_procedure_from_message(*procedure_id, message)
786                else {
787                    // Try to load other procedures.
788                    continue;
789                };
790
791                info!(
792                    "Recover root procedure {}-{}, step: {}",
793                    loaded_procedure.procedure.type_name(),
794                    procedure_id,
795                    loaded_procedure.step
796                );
797
798                let procedure_state = match init_state {
799                    InitProcedureState::RollingBack => ProcedureState::RollingBack {
800                        error: Arc::new(
801                            error::RollbackProcedureRecoveredSnafu {
802                                error: message.error.clone().unwrap_or("Unknown error".to_string()),
803                            }
804                            .build(),
805                        ),
806                    },
807                    InitProcedureState::Running => ProcedureState::Running,
808                };
809
810                if let Err(e) = loaded_procedure.procedure.recover() {
811                    error!(e; "Failed to recover procedure {}", procedure_id);
812                }
813
814                if let Err(e) = self.submit_root(
815                    *procedure_id,
816                    procedure_state,
817                    loaded_procedure.step,
818                    loaded_procedure.procedure,
819                ) {
820                    error!(e; "Failed to recover procedure {}", procedure_id);
821                }
822            }
823        }
824    }
825
826    /// Recovers unfinished procedures and reruns them.
827    async fn recover(&self) -> Result<()> {
828        info!("LocalManager start to recover");
829        let recover_start = Instant::now();
830
831        let ProcedureMessages {
832            messages,
833            rollback_messages,
834            finished_ids,
835        } = self.procedure_store.load_messages().await?;
836        // Submits recovered messages first.
837        self.submit_recovered_messages(rollback_messages, InitProcedureState::RollingBack);
838        self.submit_recovered_messages(messages, InitProcedureState::Running);
839
840        if !finished_ids.is_empty() {
841            info!(
842                "LocalManager try to clean finished procedures, num: {}",
843                finished_ids.len()
844            );
845
846            for procedure_id in finished_ids {
847                if let Err(e) = self.procedure_store.delete_procedure(procedure_id).await {
848                    error!(e; "Failed to delete procedure {}", procedure_id);
849                }
850            }
851        }
852
853        info!(
854            "LocalManager finish recovery, cost: {}ms",
855            recover_start.elapsed().as_millis()
856        );
857
858        Ok(())
859    }
860
/// Reports whether a loader has been registered under `name`.
///
/// Only compiled for tests or when the `testing` feature is enabled.
///
/// # Panics
///
/// Panics if the loaders mutex is poisoned.
#[cfg(any(test, feature = "testing"))]
pub fn contains_loader(&self, name: &str) -> bool {
    self.manager_ctx.loaders.lock().unwrap().contains_key(name)
}
867
868    async fn check_status(&self) -> Result<()> {
869        if let Some(pause_aware) = self.pause_aware.as_ref() {
870            ensure!(
871                !pause_aware.is_paused().await.context(CheckStatusSnafu)?,
872                ManagerPasuedSnafu
873            );
874        }
875
876        Ok(())
877    }
878}
879
880#[async_trait]
881impl ProcedureManager for LocalManager {
882    fn register_loader(&self, name: &str, loader: BoxedProcedureLoader) -> Result<()> {
883        let mut loaders = self.manager_ctx.loaders.lock().unwrap();
884        ensure!(!loaders.contains_key(name), LoaderConflictSnafu { name });
885
886        let _ = loaders.insert(name.to_string(), loader);
887
888        Ok(())
889    }
890
891    async fn start(&self) -> Result<()> {
892        let mut task = self.remove_outdated_meta_task.lock().await;
893
894        if task.is_some() {
895            return Ok(());
896        }
897
898        let task_inner = self.build_remove_outdated_meta_task();
899
900        task_inner
901            .start(common_runtime::global_runtime())
902            .context(StartRemoveOutdatedMetaTaskSnafu)?;
903
904        *task = Some(task_inner);
905
906        self.manager_ctx.reset_runtime_state();
907        self.manager_ctx.start();
908
909        info!("LocalManager is start.");
910
911        self.recover().await
912    }
913
914    async fn stop(&self) -> Result<()> {
915        self.manager_ctx.stop();
916
917        let mut task = self.remove_outdated_meta_task.lock().await;
918        if let Some(task) = task.take()
919            && let Err(e) = task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)
920        {
921            error!(e; "Failed to stop remove outdated meta task");
922        };
923
924        self.manager_ctx.abort_runner_tasks().await;
925        self.manager_ctx.reset_runtime_state();
926
927        info!("LocalManager is stopped.");
928
929        Ok(())
930    }
931
932    async fn submit(&self, procedure: ProcedureWithId) -> Result<Watcher> {
933        let procedure_id = procedure.id;
934        ensure!(
935            !self.manager_ctx.contains_procedure(procedure_id),
936            DuplicateProcedureSnafu { procedure_id }
937        );
938        self.check_status().await?;
939
940        self.submit_root(
941            procedure.id,
942            ProcedureState::Running,
943            0,
944            procedure.procedure,
945        )
946    }
947
948    async fn procedure_state(&self, procedure_id: ProcedureId) -> Result<Option<ProcedureState>> {
949        Ok(self.manager_ctx.state(procedure_id))
950    }
951
952    fn procedure_watcher(&self, procedure_id: ProcedureId) -> Option<Watcher> {
953        self.manager_ctx.watcher(procedure_id)
954    }
955
956    async fn list_procedures(&self) -> Result<Vec<ProcedureInfo>> {
957        Ok(self.manager_ctx.list_procedure())
958    }
959}
960
961struct RemoveOutdatedMetaFunction {
962    manager_ctx: Arc<ManagerContext>,
963    ttl: Duration,
964}
965
966#[async_trait::async_trait]
967impl TaskFunction<Error> for RemoveOutdatedMetaFunction {
968    fn name(&self) -> &str {
969        "ProcedureManager-remove-outdated-meta-task"
970    }
971
972    async fn call(&mut self) -> Result<()> {
973        self.manager_ctx.remove_outdated_meta(self.ttl);
974        Ok(())
975    }
976}
977
/// Helpers for tests: a ready-made [ProcedureMeta] and a filesystem-backed object store.
#[cfg(test)]
pub(crate) mod test_util {
    use common_test_util::temp_dir::TempDir;
    use object_store::ObjectStore;
    use object_store::services::Fs as Builder;

    use super::*;

    /// Creates a new [ProcedureMeta] for test purpose.
    ///
    /// The meta has a random id, the running state, default lock and poison keys and
    /// the type name `"ProcedureAdapter"`.
    pub(crate) fn procedure_meta_for_test() -> ProcedureMeta {
        let id = ProcedureId::random();
        let state = ProcedureState::Running;

        ProcedureMeta::new(
            id,
            state,
            None,
            LockKey::default(),
            PoisonKeys::default(),
            "ProcedureAdapter",
            None,
            None,
        )
    }

    /// Builds an [ObjectStore] rooted at the path of `dir`.
    ///
    /// # Panics
    ///
    /// Panics if the path is not valid UTF-8 or the store cannot be created.
    pub(crate) fn new_object_store(dir: &TempDir) -> ObjectStore {
        let root = dir.path().to_str().unwrap();
        let fs_builder = Builder::default().root(root);
        ObjectStore::new(fs_builder).unwrap().finish()
    }
}
1006
1007#[cfg(test)]
1008mod tests {
1009    use std::assert_matches;
1010    use std::sync::atomic::{AtomicBool, Ordering as AtomicOrdering};
1011
1012    use common_error::mock::MockError;
1013    use common_error::status_code::StatusCode;
1014    use common_test_util::temp_dir::create_temp_dir;
1015    use tokio::sync::oneshot;
1016    use tokio::time::timeout;
1017
1018    use super::*;
1019    use crate::error::{self, Error};
1020    use crate::store::state_store::ObjectStateStore;
1021    use crate::test_util::InMemoryPoisonStore;
1022    use crate::{Context, Procedure, Status};
1023
1024    fn new_test_manager_context() -> ManagerContext {
1025        let poison_manager = Arc::new(InMemoryPoisonStore::default());
1026        ManagerContext::new(poison_manager)
1027    }
1028
/// Inserting a procedure makes it visible in the context, and state changes made on the
/// meta are reflected by the context.
#[test]
fn test_manager_context() {
    let manager_ctx = new_test_manager_context();
    let procedure_meta = Arc::new(test_util::procedure_meta_for_test());
    let id = procedure_meta.id;

    // Nothing is known about the procedure before it is inserted.
    assert!(!manager_ctx.contains_procedure(id));
    assert!(manager_ctx.state(id).is_none());

    assert!(manager_ctx.try_insert_procedure(procedure_meta.clone()));
    assert!(manager_ctx.contains_procedure(id));

    let state = manager_ctx.state(id).unwrap();
    assert!(state.is_running());

    procedure_meta.set_state(ProcedureState::Done { output: None });
    let state = manager_ctx.state(id).unwrap();
    assert!(state.is_done());
}
1044
/// `reset_runtime_state` clears procedures, running and finished bookkeeping, runner
/// tasks and both key locks.
#[test]
fn test_reset_runtime_state() {
    let ctx = new_test_manager_context();
    ctx.set_running();

    let mut meta = test_util::procedure_meta_for_test();
    meta.lock_key = LockKey::single_exclusive("test.reset_runtime_state");
    let procedure_id = meta.id;
    let meta = Arc::new(meta);

    // Populate every piece of runtime state the reset is expected to clear.
    assert!(ctx.try_insert_procedure(meta.clone()));
    {
        let mut finished = ctx.finished_procedures.lock().unwrap();
        finished.push_back((procedure_id, Instant::now()));
    }
    ctx.spawn_runner_task(procedure_id, || {
        common_runtime::spawn_global(std::future::pending::<()>())
    });

    let static_guard = ctx
        .key_lock
        .try_write("test.reset_runtime_state".to_string());
    drop(static_guard);
    let dynamic_guard = ctx
        .dynamic_key_lock
        .try_write("test.reset_runtime_state.dynamic".to_string());
    drop(dynamic_guard);

    assert!(ctx.contains_procedure(procedure_id));
    assert_eq!(1, ctx.running_procedures.lock().unwrap().len());
    assert_eq!(1, ctx.finished_procedures.lock().unwrap().len());
    assert_eq!(1, ctx.runner_tasks.lock().unwrap().len());
    assert_eq!(1, ctx.key_lock.len());
    assert_eq!(1, ctx.dynamic_key_lock.len());

    ctx.reset_runtime_state();

    // Everything populated above must be gone.
    assert!(!ctx.contains_procedure(procedure_id));
    assert!(ctx.running_procedures.lock().unwrap().is_empty());
    assert!(ctx.finished_procedures.lock().unwrap().is_empty());
    assert!(ctx.runner_tasks.lock().unwrap().is_empty());
    assert!(ctx.key_lock.is_empty());
    assert!(ctx.dynamic_key_lock.is_empty());
}
1087
/// A freshly created context (never set running) must refuse to spawn a runner task
/// and must not invoke the spawn closure.
#[test]
fn test_spawn_runner_task_not_started_after_stop() {
    let ctx = new_test_manager_context();
    let procedure_id = ProcedureId::random();

    let spawned = Arc::new(AtomicBool::new(false));
    let started = {
        let flag = Arc::clone(&spawned);
        ctx.spawn_runner_task(procedure_id, || {
            common_runtime::spawn_global(async move {
                flag.store(true, AtomicOrdering::Relaxed);
            })
        })
    };

    assert!(!started);
    assert!(!spawned.load(AtomicOrdering::Relaxed));
    assert!(ctx.runner_tasks.lock().unwrap().is_empty());
}
1105
/// Inserting the same procedure meta twice succeeds only the first time.
#[test]
fn test_manager_context_insert_duplicate() {
    let ctx = new_test_manager_context();
    let first = Arc::new(test_util::procedure_meta_for_test());
    let duplicate = Arc::clone(&first);

    assert!(ctx.try_insert_procedure(first));
    assert!(!ctx.try_insert_procedure(duplicate));
}
1114
1115    fn new_child(parent_id: ProcedureId, ctx: &ManagerContext) -> ProcedureMetaRef {
1116        let mut child = test_util::procedure_meta_for_test();
1117        child.parent_id = Some(parent_id);
1118        let child = Arc::new(child);
1119        assert!(ctx.try_insert_procedure(child.clone()));
1120
1121        let mut parent = Vec::new();
1122        ctx.find_procedures(&[parent_id], &mut parent);
1123        parent[0].push_child(child.id);
1124
1125        child
1126    }
1127
/// `procedures_in_tree` returns the root followed by its descendants, level by level
/// in insertion order.
#[test]
fn test_procedures_in_tree() {
    let ctx = new_test_manager_context();
    let root = Arc::new(test_util::procedure_meta_for_test());
    assert!(ctx.try_insert_procedure(root.clone()));

    // A lone root forms a tree of size one.
    assert_eq!(1, ctx.procedures_in_tree(&root).len());

    // First level below the root.
    let left = new_child(root.id, &ctx);
    let right = new_child(root.id, &ctx);

    // Second level: two children under `left`, one under `right`.
    let left_first = new_child(left.id, &ctx);
    let left_second = new_child(left.id, &ctx);
    let right_only = new_child(right.id, &ctx);

    let expected = vec![
        root.id,
        left.id,
        right.id,
        left_first.id,
        left_second.id,
        right_only.id,
    ];
    assert_eq!(expected, ctx.procedures_in_tree(&root));
}
1149
1150    #[derive(Debug)]
1151    struct ProcedureToLoad {
1152        content: String,
1153        lock_key: LockKey,
1154        poison_keys: PoisonKeys,
1155    }
1156
1157    #[async_trait]
1158    impl Procedure for ProcedureToLoad {
1159        fn type_name(&self) -> &str {
1160            "ProcedureToLoad"
1161        }
1162
1163        async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
1164            Ok(Status::done())
1165        }
1166
1167        fn dump(&self) -> Result<String> {
1168            Ok(self.content.clone())
1169        }
1170
1171        fn lock_key(&self) -> LockKey {
1172            self.lock_key.clone()
1173        }
1174
1175        fn poison_keys(&self) -> PoisonKeys {
1176            self.poison_keys.clone()
1177        }
1178    }
1179
1180    impl ProcedureToLoad {
1181        fn new(content: &str) -> ProcedureToLoad {
1182            ProcedureToLoad {
1183                content: content.to_string(),
1184                lock_key: LockKey::default(),
1185                poison_keys: PoisonKeys::default(),
1186            }
1187        }
1188
1189        fn loader() -> BoxedProcedureLoader {
1190            let f = |json: &str| {
1191                let procedure = ProcedureToLoad::new(json);
1192                Ok(Box::new(procedure) as _)
1193            };
1194            Box::new(f)
1195        }
1196    }
1197
1198    #[derive(Debug)]
1199    struct BlockingProcedure {
1200        started_tx: Option<oneshot::Sender<()>>,
1201        dropped: Arc<AtomicBool>,
1202        lock_key: LockKey,
1203    }
1204
1205    impl Drop for BlockingProcedure {
1206        fn drop(&mut self) {
1207            self.dropped.store(true, AtomicOrdering::Relaxed);
1208        }
1209    }
1210
1211    #[async_trait]
1212    impl Procedure for BlockingProcedure {
1213        fn type_name(&self) -> &str {
1214            "BlockingProcedure"
1215        }
1216
1217        async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
1218            if let Some(tx) = self.started_tx.take() {
1219                let _ = tx.send(());
1220            }
1221            std::future::pending::<Result<Status>>().await
1222        }
1223
1224        fn dump(&self) -> Result<String> {
1225            Ok(String::new())
1226        }
1227
1228        fn lock_key(&self) -> LockKey {
1229            self.lock_key.clone()
1230        }
1231    }
1232
1233    #[tokio::test]
1234    async fn test_stop_aborts_runner_and_resets_runtime_state() {
1235        let dir = create_temp_dir("stop_aborts_runner_and_resets_runtime_state");
1236        let config = ManagerConfig::default();
1237        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1238        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1239        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1240        manager.start().await.unwrap();
1241
1242        let procedure_id = ProcedureId::random();
1243        let (started_tx, started_rx) = oneshot::channel();
1244        let dropped = Arc::new(AtomicBool::new(false));
1245        let procedure = BlockingProcedure {
1246            started_tx: Some(started_tx),
1247            dropped: dropped.clone(),
1248            lock_key: LockKey::single_exclusive("test.stop_aborts_runner"),
1249        };
1250
1251        manager
1252            .submit(ProcedureWithId {
1253                id: procedure_id,
1254                procedure: Box::new(procedure),
1255            })
1256            .await
1257            .unwrap();
1258        timeout(Duration::from_secs(5), started_rx)
1259            .await
1260            .unwrap()
1261            .unwrap();
1262
1263        assert!(manager.manager_ctx.contains_procedure(procedure_id));
1264        assert_eq!(
1265            1,
1266            manager.manager_ctx.running_procedures.lock().unwrap().len()
1267        );
1268        assert_eq!(1, manager.manager_ctx.runner_tasks.lock().unwrap().len());
1269        assert_eq!(1, manager.manager_ctx.key_lock.len());
1270
1271        manager.stop().await.unwrap();
1272
1273        assert!(dropped.load(AtomicOrdering::Relaxed));
1274        assert!(!manager.manager_ctx.running());
1275        assert!(!manager.manager_ctx.contains_procedure(procedure_id));
1276        assert!(
1277            manager
1278                .manager_ctx
1279                .running_procedures
1280                .lock()
1281                .unwrap()
1282                .is_empty()
1283        );
1284        assert!(
1285            manager
1286                .manager_ctx
1287                .finished_procedures
1288                .lock()
1289                .unwrap()
1290                .is_empty()
1291        );
1292        assert!(manager.manager_ctx.runner_tasks.lock().unwrap().is_empty());
1293        assert!(manager.manager_ctx.key_lock.is_empty());
1294        assert!(manager.manager_ctx.dynamic_key_lock.is_empty());
1295    }
1296
/// Registering a loader succeeds once; a second registration under the same name is
/// rejected with `Error::LoaderConflict`.
#[test]
fn test_register_loader() {
    let dir = create_temp_dir("register");
    let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
    let poison_manager = Arc::new(InMemoryPoisonStore::new());
    let config = ManagerConfig {
        parent_path: "data/".to_string(),
        max_retry_times: 3,
        retry_delay: Duration::from_millis(500),
        ..Default::default()
    };
    let manager = LocalManager::new(config, state_store, poison_manager, None, None);
    manager.manager_ctx.start();

    let first = manager.register_loader("ProcedureToLoad", ProcedureToLoad::loader());
    first.unwrap();

    // Register duplicate loader.
    let second = manager.register_loader("ProcedureToLoad", ProcedureToLoad::loader());
    let err = second.unwrap_err();
    assert!(matches!(err, Error::LoaderConflict { .. }), "{err}");
}
1320
1321    #[tokio::test]
1322    async fn test_recover() {
1323        let dir = create_temp_dir("recover");
1324        let object_store = test_util::new_object_store(&dir);
1325        let config = ManagerConfig {
1326            parent_path: "data/".to_string(),
1327            max_retry_times: 3,
1328            retry_delay: Duration::from_millis(500),
1329            ..Default::default()
1330        };
1331        let state_store = Arc::new(ObjectStateStore::new(object_store.clone()));
1332        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1333        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1334        manager.manager_ctx.start();
1335
1336        manager
1337            .register_loader("ProcedureToLoad", ProcedureToLoad::loader())
1338            .unwrap();
1339
1340        // Prepare data
1341        let procedure_store = ProcedureStore::from_object_store(object_store.clone());
1342        let root: BoxedProcedure = Box::new(ProcedureToLoad::new("test recover manager"));
1343        let root_id = ProcedureId::random();
1344        // Prepare data for the root procedure.
1345        for step in 0..3 {
1346            let type_name = root.type_name().to_string();
1347            let data = root.dump().unwrap();
1348            procedure_store
1349                .store_procedure(root_id, step, type_name, data, None)
1350                .await
1351                .unwrap();
1352        }
1353
1354        let child: BoxedProcedure = Box::new(ProcedureToLoad::new("a child procedure"));
1355        let child_id = ProcedureId::random();
1356        // Prepare data for the child procedure
1357        for step in 0..2 {
1358            let type_name = child.type_name().to_string();
1359            let data = child.dump().unwrap();
1360            procedure_store
1361                .store_procedure(child_id, step, type_name, data, Some(root_id))
1362                .await
1363                .unwrap();
1364        }
1365
1366        // Recover the manager
1367        manager.recover().await.unwrap();
1368
1369        // The manager should submit the root procedure.
1370        let _ = manager.procedure_state(root_id).await.unwrap().unwrap();
1371        // Since the mocked root procedure actually doesn't submit subprocedures, so there is no
1372        // related state.
1373        assert!(manager.procedure_state(child_id).await.unwrap().is_none());
1374    }
1375
1376    #[tokio::test]
1377    async fn test_submit_procedure() {
1378        let dir = create_temp_dir("submit");
1379        let config = ManagerConfig {
1380            parent_path: "data/".to_string(),
1381            max_retry_times: 3,
1382            retry_delay: Duration::from_millis(500),
1383            ..Default::default()
1384        };
1385        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1386        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1387        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1388        manager.manager_ctx.start();
1389
1390        let procedure_id = ProcedureId::random();
1391        assert!(
1392            manager
1393                .procedure_state(procedure_id)
1394                .await
1395                .unwrap()
1396                .is_none()
1397        );
1398        assert!(manager.procedure_watcher(procedure_id).is_none());
1399
1400        let mut procedure = ProcedureToLoad::new("submit");
1401        procedure.lock_key = LockKey::single_exclusive("test.submit");
1402        assert!(
1403            manager
1404                .submit(ProcedureWithId {
1405                    id: procedure_id,
1406                    procedure: Box::new(procedure),
1407                })
1408                .await
1409                .is_ok()
1410        );
1411        assert!(
1412            manager
1413                .procedure_state(procedure_id)
1414                .await
1415                .unwrap()
1416                .is_some()
1417        );
1418        // Wait for the procedure done.
1419        let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1420        watcher.changed().await.unwrap();
1421        assert!(watcher.borrow().is_done());
1422
1423        // Try to submit procedure with same id again.
1424        let err = manager
1425            .submit(ProcedureWithId {
1426                id: procedure_id,
1427                procedure: Box::new(ProcedureToLoad::new("submit")),
1428            })
1429            .await
1430            .unwrap_err();
1431        assert!(matches!(err, Error::DuplicateProcedure { .. }), "{err}");
1432    }
1433
1434    #[tokio::test]
1435    async fn test_state_changed_on_err() {
1436        let dir = create_temp_dir("on_err");
1437        let config = ManagerConfig {
1438            parent_path: "data/".to_string(),
1439            max_retry_times: 3,
1440            retry_delay: Duration::from_millis(500),
1441            ..Default::default()
1442        };
1443        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1444        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1445        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1446        manager.manager_ctx.start();
1447
1448        #[derive(Debug)]
1449        struct MockProcedure {
1450            panic: bool,
1451        }
1452
1453        #[async_trait]
1454        impl Procedure for MockProcedure {
1455            fn type_name(&self) -> &str {
1456                "MockProcedure"
1457            }
1458
1459            async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
1460                if self.panic {
1461                    // Test the runner can set the state to failed even the procedure
1462                    // panics.
1463                    panic!();
1464                } else {
1465                    Err(Error::external(MockError::new(StatusCode::Unexpected)))
1466                }
1467            }
1468
1469            async fn rollback(&mut self, _: &Context) -> Result<()> {
1470                Ok(())
1471            }
1472
1473            fn rollback_supported(&self) -> bool {
1474                true
1475            }
1476
1477            fn dump(&self) -> Result<String> {
1478                Ok(String::new())
1479            }
1480
1481            fn lock_key(&self) -> LockKey {
1482                LockKey::single_exclusive("test.submit")
1483            }
1484
1485            fn poison_keys(&self) -> PoisonKeys {
1486                PoisonKeys::default()
1487            }
1488        }
1489
1490        let check_procedure = |procedure| async {
1491            let procedure_id = ProcedureId::random();
1492            manager
1493                .submit(ProcedureWithId {
1494                    id: procedure_id,
1495                    procedure: Box::new(procedure),
1496                })
1497                .await
1498                .unwrap()
1499        };
1500
1501        let mut watcher = check_procedure(MockProcedure { panic: false }).await;
1502        // Wait for the notification.
1503        watcher.changed().await.unwrap();
1504        assert!(watcher.borrow().is_prepare_rollback());
1505        watcher.changed().await.unwrap();
1506        assert!(watcher.borrow().is_rolling_back());
1507        watcher.changed().await.unwrap();
1508        assert!(watcher.borrow().is_failed());
1509        // The runner won't rollback a panicked procedure.
1510        let mut watcher = check_procedure(MockProcedure { panic: true }).await;
1511        watcher.changed().await.unwrap();
1512        assert!(watcher.borrow().is_failed());
1513    }
1514
1515    #[tokio::test]
1516    async fn test_procedure_manager_stopped() {
1517        let dir = create_temp_dir("procedure_manager_stopped");
1518        let config = ManagerConfig {
1519            parent_path: "data/".to_string(),
1520            max_retry_times: 3,
1521            retry_delay: Duration::from_millis(500),
1522            ..Default::default()
1523        };
1524        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1525        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1526        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1527
1528        let mut procedure = ProcedureToLoad::new("submit");
1529        procedure.lock_key = LockKey::single_exclusive("test.submit");
1530        let procedure_id = ProcedureId::random();
1531        assert_matches!(
1532            manager
1533                .submit(ProcedureWithId {
1534                    id: procedure_id,
1535                    procedure: Box::new(procedure),
1536                })
1537                .await
1538                .unwrap_err(),
1539            error::Error::ManagerNotStart { .. }
1540        );
1541    }
1542
1543    #[tokio::test]
1544    async fn test_procedure_manager_restart() {
1545        let dir = create_temp_dir("procedure_manager_restart");
1546        let config = ManagerConfig {
1547            parent_path: "data/".to_string(),
1548            max_retry_times: 3,
1549            retry_delay: Duration::from_millis(500),
1550            ..Default::default()
1551        };
1552        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1553        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1554        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1555
1556        manager.start().await.unwrap();
1557        manager.stop().await.unwrap();
1558        manager.start().await.unwrap();
1559
1560        let mut procedure = ProcedureToLoad::new("submit");
1561        procedure.lock_key = LockKey::single_exclusive("test.submit");
1562        let procedure_id = ProcedureId::random();
1563        assert!(
1564            manager
1565                .submit(ProcedureWithId {
1566                    id: procedure_id,
1567                    procedure: Box::new(procedure),
1568                })
1569                .await
1570                .is_ok()
1571        );
1572        assert!(
1573            manager
1574                .procedure_state(procedure_id)
1575                .await
1576                .unwrap()
1577                .is_some()
1578        );
1579    }
1580
1581    #[tokio::test(flavor = "multi_thread")]
1582    async fn test_remove_outdated_meta_task() {
1583        let dir = create_temp_dir("remove_outdated_meta_task");
1584        let object_store = test_util::new_object_store(&dir);
1585        let config = ManagerConfig {
1586            parent_path: "data/".to_string(),
1587            max_retry_times: 3,
1588            retry_delay: Duration::from_millis(500),
1589            remove_outdated_meta_task_interval: Duration::from_millis(1),
1590            remove_outdated_meta_ttl: Duration::from_millis(1),
1591            max_running_procedures: 128,
1592        };
1593        let state_store = Arc::new(ObjectStateStore::new(object_store.clone()));
1594        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1595        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1596        manager.manager_ctx.set_running();
1597
1598        let mut procedure = ProcedureToLoad::new("submit");
1599        procedure.lock_key = LockKey::single_exclusive("test.submit");
1600        let procedure_id = ProcedureId::random();
1601        assert!(
1602            manager
1603                .submit(ProcedureWithId {
1604                    id: procedure_id,
1605                    procedure: Box::new(procedure),
1606                })
1607                .await
1608                .is_ok()
1609        );
1610        let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1611        watcher.changed().await.unwrap();
1612
1613        manager.start().await.unwrap();
1614        tokio::time::sleep(Duration::from_millis(300)).await;
1615        assert!(
1616            manager
1617                .procedure_state(procedure_id)
1618                .await
1619                .unwrap()
1620                .is_none()
1621        );
1622
1623        // The remove_outdated_meta method has been stopped, so any procedure meta-data will not be automatically removed.
1624        manager.stop().await.unwrap();
1625        let mut procedure = ProcedureToLoad::new("submit");
1626        procedure.lock_key = LockKey::single_exclusive("test.submit");
1627        let procedure_id = ProcedureId::random();
1628
1629        manager.manager_ctx.set_running();
1630        assert!(
1631            manager
1632                .submit(ProcedureWithId {
1633                    id: procedure_id,
1634                    procedure: Box::new(procedure),
1635                })
1636                .await
1637                .is_ok()
1638        );
1639        let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1640        watcher.changed().await.unwrap();
1641        tokio::time::sleep(Duration::from_millis(300)).await;
1642        assert!(
1643            manager
1644                .procedure_state(procedure_id)
1645                .await
1646                .unwrap()
1647                .is_some()
1648        );
1649
1650        // After restart
1651        let mut procedure = ProcedureToLoad::new("submit");
1652        procedure.lock_key = LockKey::single_exclusive("test.submit");
1653        let procedure_id = ProcedureId::random();
1654        assert!(
1655            manager
1656                .submit(ProcedureWithId {
1657                    id: procedure_id,
1658                    procedure: Box::new(procedure),
1659                })
1660                .await
1661                .is_ok()
1662        );
1663        let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1664        watcher.changed().await.unwrap();
1665
1666        manager.start().await.unwrap();
1667        tokio::time::sleep(Duration::from_millis(300)).await;
1668        assert!(
1669            manager
1670                .procedure_state(procedure_id)
1671                .await
1672                .unwrap()
1673                .is_none()
1674        );
1675    }
1676
1677    #[tokio::test]
1678    async fn test_too_many_running_procedures() {
1679        let dir = create_temp_dir("too_many_running_procedures");
1680        let config = ManagerConfig {
1681            parent_path: "data/".to_string(),
1682            max_retry_times: 3,
1683            retry_delay: Duration::from_millis(500),
1684            max_running_procedures: 1,
1685            ..Default::default()
1686        };
1687        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1688        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1689        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1690        manager.start().await.unwrap();
1691
1692        manager
1693            .manager_ctx
1694            .running_procedures
1695            .lock()
1696            .unwrap()
1697            .insert(ProcedureId::random());
1698
1699        // Submit a new procedure should fail.
1700        let mut procedure = ProcedureToLoad::new("submit");
1701        procedure.lock_key = LockKey::single_exclusive("test.submit");
1702        let procedure_id = ProcedureId::random();
1703        let err = manager
1704            .submit(ProcedureWithId {
1705                id: procedure_id,
1706                procedure: Box::new(procedure),
1707            })
1708            .await
1709            .unwrap_err();
1710        assert!(matches!(err, Error::TooManyRunningProcedures { .. }));
1711
1712        manager
1713            .manager_ctx
1714            .running_procedures
1715            .lock()
1716            .unwrap()
1717            .clear();
1718
1719        // Submit a new procedure should succeed.
1720        let mut procedure = ProcedureToLoad::new("submit");
1721        procedure.lock_key = LockKey::single_exclusive("test.submit");
1722        assert!(
1723            manager
1724                .submit(ProcedureWithId {
1725                    id: procedure_id,
1726                    procedure: Box::new(procedure),
1727                })
1728                .await
1729                .is_ok()
1730        );
1731        assert!(
1732            manager
1733                .procedure_state(procedure_id)
1734                .await
1735                .unwrap()
1736                .is_some()
1737        );
1738        // Wait for the procedure done.
1739        let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1740        watcher.changed().await.unwrap();
1741        assert!(watcher.borrow().is_done());
1742    }
1743
1744    #[derive(Debug)]
1745    struct ProcedureToRecover {
1746        content: String,
1747        lock_key: LockKey,
1748        notify: Option<Arc<Notify>>,
1749        poison_keys: PoisonKeys,
1750    }
1751
1752    #[async_trait]
1753    impl Procedure for ProcedureToRecover {
1754        fn type_name(&self) -> &str {
1755            "ProcedureToRecover"
1756        }
1757
1758        async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
1759            Ok(Status::done())
1760        }
1761
1762        fn dump(&self) -> Result<String> {
1763            Ok(self.content.clone())
1764        }
1765
1766        fn lock_key(&self) -> LockKey {
1767            self.lock_key.clone()
1768        }
1769
1770        fn recover(&mut self) -> Result<()> {
1771            self.notify.as_ref().unwrap().notify_one();
1772            Ok(())
1773        }
1774
1775        fn poison_keys(&self) -> PoisonKeys {
1776            self.poison_keys.clone()
1777        }
1778    }
1779
1780    impl ProcedureToRecover {
1781        fn new(content: &str) -> ProcedureToRecover {
1782            ProcedureToRecover {
1783                content: content.to_string(),
1784                lock_key: LockKey::default(),
1785                poison_keys: PoisonKeys::default(),
1786                notify: None,
1787            }
1788        }
1789
1790        fn loader(notify: Arc<Notify>) -> BoxedProcedureLoader {
1791            let f = move |json: &str| {
1792                let procedure = ProcedureToRecover {
1793                    content: json.to_string(),
1794                    lock_key: LockKey::default(),
1795                    poison_keys: PoisonKeys::default(),
1796                    notify: Some(notify.clone()),
1797                };
1798                Ok(Box::new(procedure) as _)
1799            };
1800            Box::new(f)
1801        }
1802    }
1803
1804    #[tokio::test]
1805    async fn test_procedure_recover() {
1806        common_telemetry::init_default_ut_logging();
1807        let dir = create_temp_dir("procedure_recover");
1808        let object_store = test_util::new_object_store(&dir);
1809        let config = ManagerConfig {
1810            parent_path: "data/".to_string(),
1811            max_retry_times: 3,
1812            retry_delay: Duration::from_millis(500),
1813            ..Default::default()
1814        };
1815        let state_store = Arc::new(ObjectStateStore::new(object_store.clone()));
1816        let poison_manager = Arc::new(InMemoryPoisonStore::new());
1817        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1818        manager.manager_ctx.start();
1819
1820        let notify = Arc::new(Notify::new());
1821        manager
1822            .register_loader(
1823                "ProcedureToRecover",
1824                ProcedureToRecover::loader(notify.clone()),
1825            )
1826            .unwrap();
1827
1828        // Prepare data
1829        let procedure_store = ProcedureStore::from_object_store(object_store.clone());
1830        let root: BoxedProcedure = Box::new(ProcedureToRecover::new("test procedure recovery"));
1831        let root_id = ProcedureId::random();
1832        // Prepare data for the root procedure.
1833        for step in 0..3 {
1834            let type_name = root.type_name().to_string();
1835            let data = root.dump().unwrap();
1836            procedure_store
1837                .store_procedure(root_id, step, type_name, data, None)
1838                .await
1839                .unwrap();
1840        }
1841
1842        // Recover the manager
1843        manager.recover().await.unwrap();
1844        timeout(Duration::from_secs(10), notify.notified())
1845            .await
1846            .unwrap();
1847    }
1848}