1mod runner;
16
17use std::collections::hash_map::Entry;
18use std::collections::{HashMap, HashSet, VecDeque};
19use std::sync::atomic::{AtomicBool, AtomicI64, Ordering};
20use std::sync::{Arc, Mutex, RwLock};
21use std::time::{Duration, Instant};
22
23use async_trait::async_trait;
24use backon::ExponentialBuilder;
25use common_error::ext::BoxedError;
26use common_event_recorder::EventRecorderRef;
27use common_runtime::{JoinHandle, RepeatedTask, TaskFunction};
28use common_telemetry::tracing_context::{FutureExt, TracingContext};
29use common_telemetry::{error, info, tracing};
30use snafu::{OptionExt, ResultExt, ensure};
31use tokio::sync::watch::{self, Receiver, Sender};
32use tokio::sync::{Mutex as TokioMutex, Notify};
33
34use crate::error::{
35 self, CheckStatusSnafu, DuplicateProcedureSnafu, Error, LoaderConflictSnafu,
36 ManagerNotStartSnafu, ManagerPasuedSnafu, PoisonKeyNotDefinedSnafu, ProcedureNotFoundSnafu,
37 Result, StartRemoveOutdatedMetaTaskSnafu, StopRemoveOutdatedMetaTaskSnafu,
38 TooManyRunningProceduresSnafu,
39};
40use crate::event::ProcedureEvent;
41use crate::local::runner::Runner;
42use crate::procedure::{BoxedProcedureLoader, InitProcedureState, PoisonKeys, ProcedureInfo};
43use crate::rwlock::{KeyRwLock, OwnedKeyRwLockGuard};
44use crate::store::poison_store::PoisonStoreRef;
45use crate::store::{ProcedureMessage, ProcedureMessages, ProcedureStore, StateStoreRef};
46use crate::{
47 BoxedProcedure, ContextProvider, LockKey, PoisonKey, ProcedureId, ProcedureManager,
48 ProcedureState, ProcedureWithId, StringKey, UserMetadata, Watcher,
49};
50
/// Default value of [`ManagerConfig::remove_outdated_meta_ttl`]: 10 minutes.
const META_TTL: Duration = Duration::from_secs(60 * 10);
53
/// In-memory bookkeeping the manager keeps for a single procedure.
///
/// Mutable parts (state, children, timestamps) use interior mutability so the
/// value can be shared behind an `Arc`.
#[derive(Debug)]
pub(crate) struct ProcedureMeta {
    /// Id of this procedure.
    id: ProcedureId,
    /// Type name of the procedure.
    type_name: String,
    /// Id of the parent procedure; root procedures are created with `None`.
    parent_id: Option<ProcedureId>,
    /// Signalled through `ManagerContext::notify_by_subprocedure`.
    child_notify: Notify,
    /// Lock key reported by the procedure.
    lock_key: LockKey,
    /// Poison keys the procedure is allowed to put.
    poison_keys: PoisonKeys,
    /// Sending half of the watch channel that carries the procedure state.
    state_sender: Sender<ProcedureState>,
    /// Receiving half of the state channel; cloned to hand out watchers.
    state_receiver: Receiver<ProcedureState>,
    /// Ids of the children registered via `push_child`.
    children: Mutex<Vec<ProcedureId>>,
    /// Start timestamp in milliseconds; `0` until `set_start_time_ms` is called.
    start_time_ms: AtomicI64,
    /// End timestamp in milliseconds; `0` until `set_end_time_ms` is called.
    end_time_ms: AtomicI64,
    /// Recorder that receives an event on every state change, if configured.
    event_recorder: Option<EventRecorderRef>,
    /// User metadata that state-change events are built from, if any.
    user_metadata: Option<UserMetadata>,
}
91
92impl ProcedureMeta {
93 #[allow(clippy::too_many_arguments)]
94 fn new(
95 id: ProcedureId,
96 procedure_state: ProcedureState,
97 parent_id: Option<ProcedureId>,
98 lock_key: LockKey,
99 poison_keys: PoisonKeys,
100 type_name: &str,
101 event_recorder: Option<EventRecorderRef>,
102 user_metadata: Option<UserMetadata>,
103 ) -> ProcedureMeta {
104 let (state_sender, state_receiver) = watch::channel(procedure_state);
105 ProcedureMeta {
106 id,
107 parent_id,
108 child_notify: Notify::new(),
109 lock_key,
110 poison_keys,
111 state_sender,
112 state_receiver,
113 children: Mutex::new(Vec::new()),
114 start_time_ms: AtomicI64::new(0),
115 end_time_ms: AtomicI64::new(0),
116 type_name: type_name.to_string(),
117 event_recorder,
118 user_metadata,
119 }
120 }
121
122 fn state(&self) -> ProcedureState {
124 self.state_receiver.borrow().clone()
125 }
126
127 fn set_state(&self, state: ProcedureState) {
129 if let (Some(event_recorder), Some(user_metadata)) =
131 (&self.event_recorder, &self.user_metadata)
132 && let Some(event) = user_metadata.to_event()
133 {
134 event_recorder.record(Box::new(ProcedureEvent::new(self.id, event, state.clone())));
135 }
136
137 self.state_sender.send(state).unwrap();
139 }
140
141 fn push_child(&self, procedure_id: ProcedureId) {
143 let mut children = self.children.lock().unwrap();
144 children.push(procedure_id);
145 }
146
147 fn list_children(&self, buffer: &mut Vec<ProcedureId>) {
149 let children = self.children.lock().unwrap();
150 buffer.extend_from_slice(&children);
151 }
152
153 fn num_children(&self) -> usize {
155 self.children.lock().unwrap().len()
156 }
157
158 fn set_start_time_ms(&self) {
160 self.start_time_ms
161 .store(common_time::util::current_time_millis(), Ordering::Relaxed);
162 }
163
164 fn set_end_time_ms(&self) {
166 self.end_time_ms
167 .store(common_time::util::current_time_millis(), Ordering::Relaxed);
168 }
169}
170
/// Shared, reference-counted handle to a [`ProcedureMeta`].
type ProcedureMetaRef = Arc<ProcedureMeta>;
173
/// A procedure rebuilt from a stored [`ProcedureMessage`].
struct LoadedProcedure {
    /// The procedure instance produced by its loader.
    procedure: BoxedProcedure,
    /// The step copied from the message the procedure was loaded from.
    step: u32,
}
179
/// Shared string-keyed read/write lock used by [`acquire_dynamic_key_lock`].
pub(crate) type DynamicKeyLock = Arc<KeyRwLock<String>>;
187
188pub async fn acquire_dynamic_key_lock(
193 lock: &DynamicKeyLock,
194 key: &StringKey,
195) -> DynamicKeyLockGuard {
196 match key {
197 StringKey::Share(key) => {
198 let guard = lock.read(key.clone()).await;
199 DynamicKeyLockGuard {
200 guard: Some(OwnedKeyRwLockGuard::from(guard)),
201 key: key.clone(),
202 lock: lock.clone(),
203 }
204 }
205 StringKey::Exclusive(key) => {
206 let guard = lock.write(key.clone()).await;
207 DynamicKeyLockGuard {
208 guard: Some(OwnedKeyRwLockGuard::from(guard)),
209 key: key.clone(),
210 lock: lock.clone(),
211 }
212 }
213 }
214}
/// Guard returned by [`acquire_dynamic_key_lock`].
///
/// Dropping it releases the held lock and then asks the lock to clean up the
/// entry for `key`.
pub struct DynamicKeyLockGuard {
    /// The held read or write guard; taken out (set to `None`) on drop.
    guard: Option<OwnedKeyRwLockGuard>,
    /// The key this guard was acquired for.
    key: String,
    /// The lock the guard belongs to, kept so `clean_keys` can be called on drop.
    lock: DynamicKeyLock,
}
224
225impl Drop for DynamicKeyLockGuard {
226 fn drop(&mut self) {
227 if let Some(guard) = self.guard.take() {
228 drop(guard);
229 }
230 self.lock.clean_keys(std::slice::from_ref(&self.key));
231 }
232}
233
/// Shared runtime state of the procedure manager.
pub(crate) struct ManagerContext {
    /// Procedure loaders keyed by the name they were registered under.
    loaders: Mutex<HashMap<String, BoxedProcedureLoader>>,
    /// String-keyed read/write lock; cleared by `reset_runtime_state`.
    // NOTE(review): presumably acquired by the runner for procedure lock keys — confirm in `runner`.
    key_lock: KeyRwLock<String>,
    /// Lock handed out through `ContextProvider::acquire_lock`.
    dynamic_key_lock: DynamicKeyLock,
    /// Metadata of every procedure the manager currently knows, keyed by id.
    procedures: RwLock<HashMap<ProcedureId, ProcedureMetaRef>>,
    /// Ids inserted by `try_insert_procedure` and not yet reported as finished.
    running_procedures: Mutex<HashSet<ProcedureId>>,
    /// Finished procedure ids with the instant they were reported finished.
    finished_procedures: Mutex<VecDeque<(ProcedureId, Instant)>>,
    /// Join handles of the spawned runner tasks, keyed by procedure id.
    runner_tasks: Mutex<HashMap<ProcedureId, JoinHandle<()>>>,
    /// Whether the manager is started.
    running: Arc<AtomicBool>,
    /// Store that `try_put_poison` writes to.
    poison_manager: PoisonStoreRef,
}
264
265#[async_trait]
266impl ContextProvider for ManagerContext {
267 async fn procedure_state(&self, procedure_id: ProcedureId) -> Result<Option<ProcedureState>> {
268 Ok(self.state(procedure_id))
269 }
270
271 async fn procedure_state_receiver(
272 &self,
273 procedure_id: ProcedureId,
274 ) -> Result<Option<Receiver<ProcedureState>>> {
275 Ok(self.state_receiver(procedure_id))
276 }
277
278 async fn try_put_poison(&self, key: &PoisonKey, procedure_id: ProcedureId) -> Result<()> {
279 {
280 let procedures = self.procedures.read().unwrap();
282 let procedure = procedures
283 .get(&procedure_id)
284 .context(ProcedureNotFoundSnafu { procedure_id })?;
285
286 ensure!(
288 procedure.poison_keys.contains(key),
289 PoisonKeyNotDefinedSnafu {
290 key: key.clone(),
291 procedure_id
292 }
293 );
294 }
295 let key = key.to_string();
296 let procedure_id = procedure_id.to_string();
297 self.poison_manager.try_put_poison(key, procedure_id).await
298 }
299
300 async fn acquire_lock(&self, key: &StringKey) -> DynamicKeyLockGuard {
301 acquire_dynamic_key_lock(&self.dynamic_key_lock, key).await
302 }
303}
304
305impl ManagerContext {
306 fn new(poison_manager: PoisonStoreRef) -> ManagerContext {
308 ManagerContext {
309 key_lock: KeyRwLock::new(),
310 dynamic_key_lock: Arc::new(KeyRwLock::new()),
311 loaders: Mutex::new(HashMap::new()),
312 procedures: RwLock::new(HashMap::new()),
313 running_procedures: Mutex::new(HashSet::new()),
314 finished_procedures: Mutex::new(VecDeque::new()),
315 runner_tasks: Mutex::new(HashMap::new()),
316 running: Arc::new(AtomicBool::new(false)),
317 poison_manager,
318 }
319 }
320
321 #[cfg(test)]
322 pub(crate) fn set_running(&self) {
323 self.running.store(true, Ordering::Relaxed);
324 }
325
326 pub(crate) fn start(&self) {
328 self.running.store(true, Ordering::Relaxed);
329 }
330
331 pub(crate) fn stop(&self) {
332 self.running.store(false, Ordering::Relaxed);
333 }
334
335 fn reset_runtime_state(&self) {
336 self.procedures.write().unwrap().clear();
337 self.running_procedures.lock().unwrap().clear();
338 self.finished_procedures.lock().unwrap().clear();
339 for handle in self
340 .runner_tasks
341 .lock()
342 .unwrap()
343 .drain()
344 .map(|(_, handle)| handle)
345 {
346 handle.abort();
347 }
348 self.key_lock.clear();
349 self.dynamic_key_lock.clear();
350 }
351
352 fn spawn_runner_task<F>(&self, procedure_id: ProcedureId, spawn: F) -> bool
353 where
354 F: FnOnce() -> JoinHandle<()>,
355 {
356 let mut tasks = self.runner_tasks.lock().unwrap();
357 if !self.running() {
358 return false;
359 }
360
361 let handle = spawn();
362 let _ = tasks.insert(procedure_id, handle);
363 true
364 }
365
366 fn remove_procedure(&self, procedure_id: ProcedureId) {
367 self.procedures.write().unwrap().remove(&procedure_id);
368 self.running_procedures
369 .lock()
370 .unwrap()
371 .remove(&procedure_id);
372 }
373
374 pub(crate) fn remove_runner_task(&self, procedure_id: ProcedureId) {
375 let _ = self.runner_tasks.lock().unwrap().remove(&procedure_id);
376 }
377
378 fn take_runner_tasks(&self) -> Vec<JoinHandle<()>> {
379 self.runner_tasks
380 .lock()
381 .unwrap()
382 .drain()
383 .map(|(_, handle)| handle)
384 .collect()
385 }
386
387 async fn abort_runner_tasks(&self) {
388 let handles = self.take_runner_tasks();
389
390 for handle in &handles {
391 handle.abort();
392 }
393
394 for handle in handles {
395 if let Err(e) = handle.await
396 && !e.is_cancelled()
397 {
398 error!(
399 e; "Procedure runner task exits unexpectedly during stop",
400 );
401 }
402 }
403 }
404
405 pub(crate) fn running(&self) -> bool {
407 self.running.load(Ordering::Relaxed)
408 }
409
410 fn contains_procedure(&self, procedure_id: ProcedureId) -> bool {
412 let procedures = self.procedures.read().unwrap();
413 procedures.contains_key(&procedure_id)
414 }
415
416 fn num_running_procedures(&self) -> usize {
418 self.running_procedures.lock().unwrap().len()
419 }
420
421 fn try_insert_procedure(&self, meta: ProcedureMetaRef) -> bool {
426 let procedure_id = meta.id;
427 let mut procedures = self.procedures.write().unwrap();
428 match procedures.entry(procedure_id) {
429 Entry::Occupied(_) => return false,
430 Entry::Vacant(vacant_entry) => {
431 vacant_entry.insert(meta);
432 }
433 }
434
435 let mut running_procedures = self.running_procedures.lock().unwrap();
436 running_procedures.insert(procedure_id);
437
438 true
439 }
440
441 fn state(&self, procedure_id: ProcedureId) -> Option<ProcedureState> {
443 let procedures = self.procedures.read().unwrap();
444 procedures.get(&procedure_id).map(|meta| meta.state())
445 }
446
447 fn state_receiver(&self, procedure_id: ProcedureId) -> Option<Receiver<ProcedureState>> {
449 let procedures = self.procedures.read().unwrap();
450 procedures
451 .get(&procedure_id)
452 .map(|meta| meta.state_receiver.clone())
453 }
454
455 fn list_procedure(&self) -> Vec<ProcedureInfo> {
457 let procedures = self.procedures.read().unwrap();
458 procedures
459 .values()
460 .map(|meta| ProcedureInfo {
461 id: meta.id,
462 type_name: meta.type_name.clone(),
463 start_time_ms: meta.start_time_ms.load(Ordering::Relaxed),
464 end_time_ms: meta.end_time_ms.load(Ordering::Relaxed),
465 state: meta.state(),
466 lock_keys: meta.lock_key.get_keys(),
467 })
468 .collect()
469 }
470
471 fn watcher(&self, procedure_id: ProcedureId) -> Option<Watcher> {
473 let procedures = self.procedures.read().unwrap();
474 procedures
475 .get(&procedure_id)
476 .map(|meta| meta.state_receiver.clone())
477 }
478
479 fn notify_by_subprocedure(&self, procedure_id: ProcedureId) {
481 let procedures = self.procedures.read().unwrap();
482 if let Some(meta) = procedures.get(&procedure_id) {
483 meta.child_notify.notify_one();
484 }
485 }
486
487 fn load_one_procedure_from_message(
489 &self,
490 procedure_id: ProcedureId,
491 message: &ProcedureMessage,
492 ) -> Option<LoadedProcedure> {
493 let loaders = self.loaders.lock().unwrap();
494 let loader = loaders.get(&message.type_name).or_else(|| {
495 error!(
496 "Loader not found, procedure_id: {}, type_name: {}",
497 procedure_id, message.type_name
498 );
499 None
500 })?;
501
502 let procedure = loader(&message.data)
503 .map_err(|e| {
504 error!(
505 "Failed to load procedure data, key: {}, source: {:?}",
506 procedure_id, e
507 );
508 e
509 })
510 .ok()?;
511
512 Some(LoadedProcedure {
513 procedure,
514 step: message.step,
515 })
516 }
517
518 fn procedures_in_tree(&self, root: &ProcedureMetaRef) -> Vec<ProcedureId> {
523 let sub_num = root.num_children();
524 let mut procedures = Vec::with_capacity(1 + sub_num);
526
527 let mut queue = VecDeque::with_capacity(1 + sub_num);
528 queue.push_back(root.clone());
530
531 let mut children_ids = Vec::with_capacity(sub_num);
532 let mut children = Vec::with_capacity(sub_num);
533 while let Some(meta) = queue.pop_front() {
534 procedures.push(meta.id);
535
536 children_ids.clear();
538 meta.list_children(&mut children_ids);
539 self.find_procedures(&children_ids, &mut children);
540
541 for child in children.drain(..) {
543 queue.push_back(child);
544 }
545 }
546
547 procedures
548 }
549
550 fn find_procedures(&self, procedure_ids: &[ProcedureId], metas: &mut Vec<ProcedureMetaRef>) {
554 let procedures = self.procedures.read().unwrap();
555 for procedure_id in procedure_ids {
556 if let Some(meta) = procedures.get(procedure_id) {
557 metas.push(meta.clone());
558 }
559 }
560 }
561
562 fn on_procedures_finish(&self, procedure_ids: &[ProcedureId]) {
564 let now = Instant::now();
567 let mut finished_procedures = self.finished_procedures.lock().unwrap();
568 finished_procedures.extend(procedure_ids.iter().map(|id| (*id, now)));
569
570 let mut running_procedures = self.running_procedures.lock().unwrap();
572 for procedure_id in procedure_ids {
573 running_procedures.remove(procedure_id);
574 }
575 }
576
577 fn remove_outdated_meta(&self, ttl: Duration) {
579 let ids = {
580 let mut finished_procedures = self.finished_procedures.lock().unwrap();
581 if finished_procedures.is_empty() {
582 return;
583 }
584
585 let mut ids_to_remove = Vec::new();
586 while let Some((id, finish_time)) = finished_procedures.front() {
587 if finish_time.elapsed() > ttl {
588 ids_to_remove.push(*id);
589 let _ = finished_procedures.pop_front();
590 } else {
591 break;
594 }
595 }
596 ids_to_remove
597 };
598
599 if ids.is_empty() {
600 return;
601 }
602
603 let mut procedures = self.procedures.write().unwrap();
604 for id in ids {
605 let _ = procedures.remove(&id);
606 }
607 }
608}
609
/// Configuration of a [`LocalManager`].
#[derive(Debug)]
pub struct ManagerConfig {
    /// Path passed to the procedure store as its parent path.
    pub parent_path: String,
    /// Maximum number of retries configured on the runner's backoff builder.
    pub max_retry_times: usize,
    /// Minimum delay configured on the runner's backoff builder.
    pub retry_delay: Duration,
    /// Interval at which the remove-outdated-meta task runs.
    pub remove_outdated_meta_task_interval: Duration,
    /// How long the metadata of a finished procedure is kept before removal.
    pub remove_outdated_meta_ttl: Duration,
    /// Upper bound on running procedures checked when submitting a root procedure.
    pub max_running_procedures: usize,
}
620
621impl Default for ManagerConfig {
622 fn default() -> Self {
623 Self {
624 parent_path: String::default(),
625 max_retry_times: 3,
626 retry_delay: Duration::from_millis(500),
627 remove_outdated_meta_task_interval: Duration::from_secs(60 * 10),
628 remove_outdated_meta_ttl: META_TTL,
629 max_running_procedures: 128,
630 }
631 }
632}
633
/// Shared handle to a [`PauseAware`] implementation.
type PauseAwareRef = Arc<dyn PauseAware>;
635
/// Source of the "paused" status consulted before a procedure is submitted.
#[async_trait]
pub trait PauseAware: Send + Sync {
    /// Returns `Ok(true)` when paused; an `Err` means the status could not be determined.
    async fn is_paused(&self) -> std::result::Result<bool, BoxedError>;
}
641
/// A [`ProcedureManager`] that runs procedures as tasks on the global runtime.
pub struct LocalManager {
    /// Runtime state shared with the runners and the cleanup task.
    manager_ctx: Arc<ManagerContext>,
    /// Store the procedures are loaded from and deleted in.
    procedure_store: Arc<ProcedureStore>,
    /// Copy of `config.max_retry_times`.
    max_retry_times: usize,
    /// Copy of `config.retry_delay`.
    retry_delay: Duration,
    /// The running remove-outdated-meta task; `None` while the manager is not started.
    remove_outdated_meta_task: TokioMutex<Option<RepeatedTask<Error>>>,
    /// The configuration the manager was created with.
    config: ManagerConfig,
    /// Optional pause status source checked on `submit`.
    pause_aware: Option<PauseAwareRef>,
    /// Optional recorder for procedure events.
    event_recorder: Option<EventRecorderRef>,
}
654
655impl LocalManager {
656 pub fn new(
658 config: ManagerConfig,
659 state_store: StateStoreRef,
660 poison_store: PoisonStoreRef,
661 pause_aware: Option<PauseAwareRef>,
662 event_recorder: Option<EventRecorderRef>,
663 ) -> LocalManager {
664 let manager_ctx = Arc::new(ManagerContext::new(poison_store));
665
666 LocalManager {
667 manager_ctx,
668 procedure_store: Arc::new(ProcedureStore::new(&config.parent_path, state_store)),
669 max_retry_times: config.max_retry_times,
670 retry_delay: config.retry_delay,
671 remove_outdated_meta_task: TokioMutex::new(None),
672 config,
673 pause_aware,
674 event_recorder,
675 }
676 }
677
678 pub fn build_remove_outdated_meta_task(&self) -> RepeatedTask<Error> {
680 RepeatedTask::new(
681 self.config.remove_outdated_meta_task_interval,
682 Box::new(RemoveOutdatedMetaFunction {
683 manager_ctx: self.manager_ctx.clone(),
684 ttl: self.config.remove_outdated_meta_ttl,
685 }),
686 )
687 }
688
689 fn submit_root(
691 &self,
692 procedure_id: ProcedureId,
693 procedure_state: ProcedureState,
694 step: u32,
695 procedure: BoxedProcedure,
696 ) -> Result<Watcher> {
697 ensure!(self.manager_ctx.running(), ManagerNotStartSnafu);
698
699 let user_metadata = procedure.user_metadata();
700 let meta = Arc::new(ProcedureMeta::new(
701 procedure_id,
702 procedure_state,
703 None,
704 procedure.lock_key(),
705 procedure.poison_keys(),
706 procedure.type_name(),
707 self.event_recorder.clone(),
708 user_metadata.clone(),
709 ));
710 let runner = Runner {
711 meta: meta.clone(),
712 procedure,
713 manager_ctx: self.manager_ctx.clone(),
714 step,
715 exponential_builder: ExponentialBuilder::default()
716 .with_min_delay(self.retry_delay)
717 .with_max_times(self.max_retry_times),
718 store: self.procedure_store.clone(),
719 rolling_back: false,
720 event_recorder: self.event_recorder.clone(),
721 };
722
723 if let (Some(event_recorder), Some(event)) = (
724 self.event_recorder.as_ref(),
725 user_metadata.and_then(|m| m.to_event()),
726 ) {
727 event_recorder.record(Box::new(ProcedureEvent::new(
728 procedure_id,
729 event,
730 ProcedureState::Running,
731 )));
732 }
733
734 let watcher = meta.state_receiver.clone();
735
736 ensure!(
737 self.manager_ctx.num_running_procedures() < self.config.max_running_procedures,
738 TooManyRunningProceduresSnafu {
739 max_running_procedures: self.config.max_running_procedures,
740 }
741 );
742
743 ensure!(
745 self.manager_ctx.try_insert_procedure(meta),
746 DuplicateProcedureSnafu { procedure_id },
747 );
748
749 let tracing_context = TracingContext::from_current_span();
750
751 ensure!(
752 self.manager_ctx.spawn_runner_task(procedure_id, || {
753 common_runtime::spawn_global(async move {
754 let span = tracing_context.attach(tracing::info_span!(
755 "LocalManager::submit_root_procedure",
756 procedure_name = %runner.meta.type_name,
757 procedure_id = %runner.meta.id,
758 ));
759 runner.run().trace(span).await;
763 })
764 }),
765 {
766 self.manager_ctx.remove_procedure(procedure_id);
767 ManagerNotStartSnafu
768 }
769 );
770
771 Ok(watcher)
772 }
773
774 fn submit_recovered_messages(
775 &self,
776 messages: HashMap<ProcedureId, ProcedureMessage>,
777 init_state: InitProcedureState,
778 ) {
779 for (procedure_id, message) in &messages {
780 if message.parent_id.is_none() {
781 let Some(mut loaded_procedure) = self
784 .manager_ctx
785 .load_one_procedure_from_message(*procedure_id, message)
786 else {
787 continue;
789 };
790
791 info!(
792 "Recover root procedure {}-{}, step: {}",
793 loaded_procedure.procedure.type_name(),
794 procedure_id,
795 loaded_procedure.step
796 );
797
798 let procedure_state = match init_state {
799 InitProcedureState::RollingBack => ProcedureState::RollingBack {
800 error: Arc::new(
801 error::RollbackProcedureRecoveredSnafu {
802 error: message.error.clone().unwrap_or("Unknown error".to_string()),
803 }
804 .build(),
805 ),
806 },
807 InitProcedureState::Running => ProcedureState::Running,
808 };
809
810 if let Err(e) = loaded_procedure.procedure.recover() {
811 error!(e; "Failed to recover procedure {}", procedure_id);
812 }
813
814 if let Err(e) = self.submit_root(
815 *procedure_id,
816 procedure_state,
817 loaded_procedure.step,
818 loaded_procedure.procedure,
819 ) {
820 error!(e; "Failed to recover procedure {}", procedure_id);
821 }
822 }
823 }
824 }
825
826 async fn recover(&self) -> Result<()> {
828 info!("LocalManager start to recover");
829 let recover_start = Instant::now();
830
831 let ProcedureMessages {
832 messages,
833 rollback_messages,
834 finished_ids,
835 } = self.procedure_store.load_messages().await?;
836 self.submit_recovered_messages(rollback_messages, InitProcedureState::RollingBack);
838 self.submit_recovered_messages(messages, InitProcedureState::Running);
839
840 if !finished_ids.is_empty() {
841 info!(
842 "LocalManager try to clean finished procedures, num: {}",
843 finished_ids.len()
844 );
845
846 for procedure_id in finished_ids {
847 if let Err(e) = self.procedure_store.delete_procedure(procedure_id).await {
848 error!(e; "Failed to delete procedure {}", procedure_id);
849 }
850 }
851 }
852
853 info!(
854 "LocalManager finish recovery, cost: {}ms",
855 recover_start.elapsed().as_millis()
856 );
857
858 Ok(())
859 }
860
861 #[cfg(any(test, feature = "testing"))]
862 pub fn contains_loader(&self, name: &str) -> bool {
864 let loaders = self.manager_ctx.loaders.lock().unwrap();
865 loaders.contains_key(name)
866 }
867
868 async fn check_status(&self) -> Result<()> {
869 if let Some(pause_aware) = self.pause_aware.as_ref() {
870 ensure!(
871 !pause_aware.is_paused().await.context(CheckStatusSnafu)?,
872 ManagerPasuedSnafu
873 );
874 }
875
876 Ok(())
877 }
878}
879
880#[async_trait]
881impl ProcedureManager for LocalManager {
882 fn register_loader(&self, name: &str, loader: BoxedProcedureLoader) -> Result<()> {
883 let mut loaders = self.manager_ctx.loaders.lock().unwrap();
884 ensure!(!loaders.contains_key(name), LoaderConflictSnafu { name });
885
886 let _ = loaders.insert(name.to_string(), loader);
887
888 Ok(())
889 }
890
891 async fn start(&self) -> Result<()> {
892 let mut task = self.remove_outdated_meta_task.lock().await;
893
894 if task.is_some() {
895 return Ok(());
896 }
897
898 let task_inner = self.build_remove_outdated_meta_task();
899
900 task_inner
901 .start(common_runtime::global_runtime())
902 .context(StartRemoveOutdatedMetaTaskSnafu)?;
903
904 *task = Some(task_inner);
905
906 self.manager_ctx.reset_runtime_state();
907 self.manager_ctx.start();
908
909 info!("LocalManager is start.");
910
911 self.recover().await
912 }
913
914 async fn stop(&self) -> Result<()> {
915 self.manager_ctx.stop();
916
917 let mut task = self.remove_outdated_meta_task.lock().await;
918 if let Some(task) = task.take()
919 && let Err(e) = task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)
920 {
921 error!(e; "Failed to stop remove outdated meta task");
922 };
923
924 self.manager_ctx.abort_runner_tasks().await;
925 self.manager_ctx.reset_runtime_state();
926
927 info!("LocalManager is stopped.");
928
929 Ok(())
930 }
931
932 async fn submit(&self, procedure: ProcedureWithId) -> Result<Watcher> {
933 let procedure_id = procedure.id;
934 ensure!(
935 !self.manager_ctx.contains_procedure(procedure_id),
936 DuplicateProcedureSnafu { procedure_id }
937 );
938 self.check_status().await?;
939
940 self.submit_root(
941 procedure.id,
942 ProcedureState::Running,
943 0,
944 procedure.procedure,
945 )
946 }
947
948 async fn procedure_state(&self, procedure_id: ProcedureId) -> Result<Option<ProcedureState>> {
949 Ok(self.manager_ctx.state(procedure_id))
950 }
951
952 fn procedure_watcher(&self, procedure_id: ProcedureId) -> Option<Watcher> {
953 self.manager_ctx.watcher(procedure_id)
954 }
955
956 async fn list_procedures(&self) -> Result<Vec<ProcedureInfo>> {
957 Ok(self.manager_ctx.list_procedure())
958 }
959}
960
/// Task body that removes the metadata of procedures that finished long ago.
struct RemoveOutdatedMetaFunction {
    /// Context whose finished procedures are cleaned up.
    manager_ctx: Arc<ManagerContext>,
    /// How long a finished procedure's metadata is kept.
    ttl: Duration,
}
965
966#[async_trait::async_trait]
967impl TaskFunction<Error> for RemoveOutdatedMetaFunction {
968 fn name(&self) -> &str {
969 "ProcedureManager-remove-outdated-meta-task"
970 }
971
972 async fn call(&mut self) -> Result<()> {
973 self.manager_ctx.remove_outdated_meta(self.ttl);
974 Ok(())
975 }
976}
977
/// Helpers shared by the tests of this module and its submodules.
#[cfg(test)]
pub(crate) mod test_util {
    use common_test_util::temp_dir::TempDir;
    use object_store::ObjectStore;
    use object_store::services::Fs as Builder;

    use super::*;

    /// Returns the metadata of a running root procedure with a random id,
    /// default lock and poison keys, and neither event recorder nor user metadata.
    pub(crate) fn procedure_meta_for_test() -> ProcedureMeta {
        let id = ProcedureId::random();
        let lock_key = LockKey::default();
        let poison_keys = PoisonKeys::default();

        ProcedureMeta::new(
            id,
            ProcedureState::Running,
            None,
            lock_key,
            poison_keys,
            "ProcedureAdapter",
            None,
            None,
        )
    }

    /// Creates a file-system object store rooted at `dir`.
    pub(crate) fn new_object_store(dir: &TempDir) -> ObjectStore {
        let root = dir.path().to_str().unwrap();
        let fs_builder = Builder::default().root(root);
        ObjectStore::new(fs_builder).unwrap().finish()
    }
}
1006
1007#[cfg(test)]
1008mod tests {
1009 use std::assert_matches;
1010 use std::sync::atomic::{AtomicBool, Ordering as AtomicOrdering};
1011
1012 use common_error::mock::MockError;
1013 use common_error::status_code::StatusCode;
1014 use common_test_util::temp_dir::create_temp_dir;
1015 use tokio::sync::oneshot;
1016 use tokio::time::timeout;
1017
1018 use super::*;
1019 use crate::error::{self, Error};
1020 use crate::store::state_store::ObjectStateStore;
1021 use crate::test_util::InMemoryPoisonStore;
1022 use crate::{Context, Procedure, Status};
1023
1024 fn new_test_manager_context() -> ManagerContext {
1025 let poison_manager = Arc::new(InMemoryPoisonStore::default());
1026 ManagerContext::new(poison_manager)
1027 }
1028
1029 #[test]
1030 fn test_manager_context() {
1031 let ctx = new_test_manager_context();
1032 let meta = Arc::new(test_util::procedure_meta_for_test());
1033
1034 assert!(!ctx.contains_procedure(meta.id));
1035 assert!(ctx.state(meta.id).is_none());
1036
1037 assert!(ctx.try_insert_procedure(meta.clone()));
1038 assert!(ctx.contains_procedure(meta.id));
1039
1040 assert!(ctx.state(meta.id).unwrap().is_running());
1041 meta.set_state(ProcedureState::Done { output: None });
1042 assert!(ctx.state(meta.id).unwrap().is_done());
1043 }
1044
    /// `reset_runtime_state` must empty every piece of runtime bookkeeping.
    #[test]
    fn test_reset_runtime_state() {
        let ctx = new_test_manager_context();
        // `spawn_runner_task` only registers tasks while the context is running.
        ctx.set_running();
        let mut meta = test_util::procedure_meta_for_test();
        meta.lock_key = LockKey::single_exclusive("test.reset_runtime_state");
        let meta = Arc::new(meta);
        let procedure_id = meta.id;

        // Populate every collection that the reset is expected to clear.
        assert!(ctx.try_insert_procedure(meta.clone()));
        ctx.finished_procedures
            .lock()
            .unwrap()
            .push_back((procedure_id, Instant::now()));
        // A task that never completes on its own.
        ctx.spawn_runner_task(procedure_id, || {
            common_runtime::spawn_global(std::future::pending::<()>())
        });

        // Touch one key in each lock; the result is dropped right away.
        // NOTE(review): presumably the key entry stays in the map after the
        // drop, which is what the `len() == 1` assertions below rely on — confirm.
        drop(
            ctx.key_lock
                .try_write("test.reset_runtime_state".to_string()),
        );
        drop(
            ctx.dynamic_key_lock
                .try_write("test.reset_runtime_state.dynamic".to_string()),
        );
        assert!(ctx.contains_procedure(procedure_id));
        assert_eq!(1, ctx.running_procedures.lock().unwrap().len());
        assert_eq!(1, ctx.finished_procedures.lock().unwrap().len());
        assert_eq!(1, ctx.runner_tasks.lock().unwrap().len());
        assert_eq!(1, ctx.key_lock.len());
        assert_eq!(1, ctx.dynamic_key_lock.len());

        ctx.reset_runtime_state();

        // Everything is empty afterwards.
        assert!(!ctx.contains_procedure(procedure_id));
        assert!(ctx.running_procedures.lock().unwrap().is_empty());
        assert!(ctx.finished_procedures.lock().unwrap().is_empty());
        assert!(ctx.runner_tasks.lock().unwrap().is_empty());
        assert!(ctx.key_lock.is_empty());
        assert!(ctx.dynamic_key_lock.is_empty());
    }
1087
1088 #[test]
1089 fn test_spawn_runner_task_not_started_after_stop() {
1090 let ctx = new_test_manager_context();
1091 let procedure_id = ProcedureId::random();
1092
1093 let spawned = Arc::new(AtomicBool::new(false));
1094 let spawned_in_task = spawned.clone();
1095 let started = ctx.spawn_runner_task(procedure_id, || {
1096 common_runtime::spawn_global(async move {
1097 spawned_in_task.store(true, AtomicOrdering::Relaxed);
1098 })
1099 });
1100
1101 assert!(!started);
1102 assert!(!spawned.load(AtomicOrdering::Relaxed));
1103 assert!(ctx.runner_tasks.lock().unwrap().is_empty());
1104 }
1105
1106 #[test]
1107 fn test_manager_context_insert_duplicate() {
1108 let ctx = new_test_manager_context();
1109 let meta = Arc::new(test_util::procedure_meta_for_test());
1110
1111 assert!(ctx.try_insert_procedure(meta.clone()));
1112 assert!(!ctx.try_insert_procedure(meta));
1113 }
1114
1115 fn new_child(parent_id: ProcedureId, ctx: &ManagerContext) -> ProcedureMetaRef {
1116 let mut child = test_util::procedure_meta_for_test();
1117 child.parent_id = Some(parent_id);
1118 let child = Arc::new(child);
1119 assert!(ctx.try_insert_procedure(child.clone()));
1120
1121 let mut parent = Vec::new();
1122 ctx.find_procedures(&[parent_id], &mut parent);
1123 parent[0].push_child(child.id);
1124
1125 child
1126 }
1127
1128 #[test]
1129 fn test_procedures_in_tree() {
1130 let ctx = new_test_manager_context();
1131 let root = Arc::new(test_util::procedure_meta_for_test());
1132 assert!(ctx.try_insert_procedure(root.clone()));
1133
1134 assert_eq!(1, ctx.procedures_in_tree(&root).len());
1135
1136 let child1 = new_child(root.id, &ctx);
1137 let child2 = new_child(root.id, &ctx);
1138
1139 let child3 = new_child(child1.id, &ctx);
1140 let child4 = new_child(child1.id, &ctx);
1141
1142 let child5 = new_child(child2.id, &ctx);
1143
1144 let expect = vec![
1145 root.id, child1.id, child2.id, child3.id, child4.id, child5.id,
1146 ];
1147 assert_eq!(expect, ctx.procedures_in_tree(&root));
1148 }
1149
    /// Test procedure that completes immediately and dumps its `content`.
    #[derive(Debug)]
    struct ProcedureToLoad {
        /// Data returned verbatim by `dump`.
        content: String,
        /// Lock key reported by `lock_key`.
        lock_key: LockKey,
        /// Poison keys reported by `poison_keys`.
        poison_keys: PoisonKeys,
    }
1156
    #[async_trait]
    impl Procedure for ProcedureToLoad {
        /// Type name; the tests register the loader under the same string.
        fn type_name(&self) -> &str {
            "ProcedureToLoad"
        }

        /// Finishes immediately.
        async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
            Ok(Status::done())
        }

        /// Returns a copy of `content`.
        fn dump(&self) -> Result<String> {
            Ok(self.content.clone())
        }

        /// Returns a copy of the configured lock key.
        fn lock_key(&self) -> LockKey {
            self.lock_key.clone()
        }

        /// Returns a copy of the configured poison keys.
        fn poison_keys(&self) -> PoisonKeys {
            self.poison_keys.clone()
        }
    }
1179
1180 impl ProcedureToLoad {
1181 fn new(content: &str) -> ProcedureToLoad {
1182 ProcedureToLoad {
1183 content: content.to_string(),
1184 lock_key: LockKey::default(),
1185 poison_keys: PoisonKeys::default(),
1186 }
1187 }
1188
1189 fn loader() -> BoxedProcedureLoader {
1190 let f = |json: &str| {
1191 let procedure = ProcedureToLoad::new(json);
1192 Ok(Box::new(procedure) as _)
1193 };
1194 Box::new(f)
1195 }
1196 }
1197
    /// Test procedure whose `execute` never returns.
    #[derive(Debug)]
    struct BlockingProcedure {
        /// Fired once when `execute` is first entered.
        started_tx: Option<oneshot::Sender<()>>,
        /// Set to `true` when the procedure is dropped.
        dropped: Arc<AtomicBool>,
        /// Lock key reported by `lock_key`.
        lock_key: LockKey,
    }
1204
    impl Drop for BlockingProcedure {
        /// Flags the drop so the test can observe it.
        fn drop(&mut self) {
            self.dropped.store(true, AtomicOrdering::Relaxed);
        }
    }
1210
    #[async_trait]
    impl Procedure for BlockingProcedure {
        /// Type name of this test procedure.
        fn type_name(&self) -> &str {
            "BlockingProcedure"
        }

        /// Signals that execution started (first call only), then waits forever.
        async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
            if let Some(tx) = self.started_tx.take() {
                // The receiver may already be gone; that is fine.
                let _ = tx.send(());
            }
            std::future::pending::<Result<Status>>().await
        }

        /// Dumps an empty string.
        fn dump(&self) -> Result<String> {
            Ok(String::new())
        }

        /// Returns a copy of the configured lock key.
        fn lock_key(&self) -> LockKey {
            self.lock_key.clone()
        }
    }
1232
    /// Stopping the manager must abort a runner that is blocked in `execute`
    /// (dropping its procedure) and clear all runtime state.
    #[tokio::test]
    async fn test_stop_aborts_runner_and_resets_runtime_state() {
        let dir = create_temp_dir("stop_aborts_runner_and_resets_runtime_state");
        let config = ManagerConfig::default();
        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
        let poison_manager = Arc::new(InMemoryPoisonStore::new());
        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
        manager.start().await.unwrap();

        let procedure_id = ProcedureId::random();
        let (started_tx, started_rx) = oneshot::channel();
        let dropped = Arc::new(AtomicBool::new(false));
        let procedure = BlockingProcedure {
            started_tx: Some(started_tx),
            dropped: dropped.clone(),
            lock_key: LockKey::single_exclusive("test.stop_aborts_runner"),
        };

        manager
            .submit(ProcedureWithId {
                id: procedure_id,
                procedure: Box::new(procedure),
            })
            .await
            .unwrap();
        // Wait until the procedure is actually executing (bounded by a timeout).
        timeout(Duration::from_secs(5), started_rx)
            .await
            .unwrap()
            .unwrap();

        // While the procedure blocks, all bookkeeping holds exactly one entry.
        assert!(manager.manager_ctx.contains_procedure(procedure_id));
        assert_eq!(
            1,
            manager.manager_ctx.running_procedures.lock().unwrap().len()
        );
        assert_eq!(1, manager.manager_ctx.runner_tasks.lock().unwrap().len());
        assert_eq!(1, manager.manager_ctx.key_lock.len());

        manager.stop().await.unwrap();

        // The runner was aborted, so the procedure it owned has been dropped.
        assert!(dropped.load(AtomicOrdering::Relaxed));
        assert!(!manager.manager_ctx.running());
        assert!(!manager.manager_ctx.contains_procedure(procedure_id));
        assert!(
            manager
                .manager_ctx
                .running_procedures
                .lock()
                .unwrap()
                .is_empty()
        );
        assert!(
            manager
                .manager_ctx
                .finished_procedures
                .lock()
                .unwrap()
                .is_empty()
        );
        assert!(manager.manager_ctx.runner_tasks.lock().unwrap().is_empty());
        assert!(manager.manager_ctx.key_lock.is_empty());
        assert!(manager.manager_ctx.dynamic_key_lock.is_empty());
    }
1296
/// Registering a loader succeeds the first time and yields
/// `Error::LoaderConflict` when the same name is registered again.
#[test]
fn test_register_loader() {
    const LOADER_NAME: &str = "ProcedureToLoad";

    let temp_dir = create_temp_dir("register");
    let manager_config = ManagerConfig {
        parent_path: String::from("data/"),
        max_retry_times: 3,
        retry_delay: Duration::from_millis(500),
        ..ManagerConfig::default()
    };
    let store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&temp_dir)));
    let poison_store = Arc::new(InMemoryPoisonStore::new());
    let manager = LocalManager::new(manager_config, store, poison_store, None, None);
    manager.manager_ctx.start();

    let first_attempt = manager.register_loader(LOADER_NAME, ProcedureToLoad::loader());
    first_attempt.unwrap();

    // The second registration under the same name must be rejected.
    let second_attempt = manager.register_loader(LOADER_NAME, ProcedureToLoad::loader());
    let err = second_attempt.unwrap_err();
    let is_conflict = matches!(err, Error::LoaderConflict { .. });
    assert!(is_conflict, "{err}");
}
1320
1321 #[tokio::test]
1322 async fn test_recover() {
1323 let dir = create_temp_dir("recover");
1324 let object_store = test_util::new_object_store(&dir);
1325 let config = ManagerConfig {
1326 parent_path: "data/".to_string(),
1327 max_retry_times: 3,
1328 retry_delay: Duration::from_millis(500),
1329 ..Default::default()
1330 };
1331 let state_store = Arc::new(ObjectStateStore::new(object_store.clone()));
1332 let poison_manager = Arc::new(InMemoryPoisonStore::new());
1333 let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1334 manager.manager_ctx.start();
1335
1336 manager
1337 .register_loader("ProcedureToLoad", ProcedureToLoad::loader())
1338 .unwrap();
1339
1340 let procedure_store = ProcedureStore::from_object_store(object_store.clone());
1342 let root: BoxedProcedure = Box::new(ProcedureToLoad::new("test recover manager"));
1343 let root_id = ProcedureId::random();
1344 for step in 0..3 {
1346 let type_name = root.type_name().to_string();
1347 let data = root.dump().unwrap();
1348 procedure_store
1349 .store_procedure(root_id, step, type_name, data, None)
1350 .await
1351 .unwrap();
1352 }
1353
1354 let child: BoxedProcedure = Box::new(ProcedureToLoad::new("a child procedure"));
1355 let child_id = ProcedureId::random();
1356 for step in 0..2 {
1358 let type_name = child.type_name().to_string();
1359 let data = child.dump().unwrap();
1360 procedure_store
1361 .store_procedure(child_id, step, type_name, data, Some(root_id))
1362 .await
1363 .unwrap();
1364 }
1365
1366 manager.recover().await.unwrap();
1368
1369 let _ = manager.procedure_state(root_id).await.unwrap().unwrap();
1371 assert!(manager.procedure_state(child_id).await.unwrap().is_none());
1374 }
1375
1376 #[tokio::test]
1377 async fn test_submit_procedure() {
1378 let dir = create_temp_dir("submit");
1379 let config = ManagerConfig {
1380 parent_path: "data/".to_string(),
1381 max_retry_times: 3,
1382 retry_delay: Duration::from_millis(500),
1383 ..Default::default()
1384 };
1385 let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1386 let poison_manager = Arc::new(InMemoryPoisonStore::new());
1387 let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1388 manager.manager_ctx.start();
1389
1390 let procedure_id = ProcedureId::random();
1391 assert!(
1392 manager
1393 .procedure_state(procedure_id)
1394 .await
1395 .unwrap()
1396 .is_none()
1397 );
1398 assert!(manager.procedure_watcher(procedure_id).is_none());
1399
1400 let mut procedure = ProcedureToLoad::new("submit");
1401 procedure.lock_key = LockKey::single_exclusive("test.submit");
1402 assert!(
1403 manager
1404 .submit(ProcedureWithId {
1405 id: procedure_id,
1406 procedure: Box::new(procedure),
1407 })
1408 .await
1409 .is_ok()
1410 );
1411 assert!(
1412 manager
1413 .procedure_state(procedure_id)
1414 .await
1415 .unwrap()
1416 .is_some()
1417 );
1418 let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1420 watcher.changed().await.unwrap();
1421 assert!(watcher.borrow().is_done());
1422
1423 let err = manager
1425 .submit(ProcedureWithId {
1426 id: procedure_id,
1427 procedure: Box::new(ProcedureToLoad::new("submit")),
1428 })
1429 .await
1430 .unwrap_err();
1431 assert!(matches!(err, Error::DuplicateProcedure { .. }), "{err}");
1432 }
1433
1434 #[tokio::test]
1435 async fn test_state_changed_on_err() {
1436 let dir = create_temp_dir("on_err");
1437 let config = ManagerConfig {
1438 parent_path: "data/".to_string(),
1439 max_retry_times: 3,
1440 retry_delay: Duration::from_millis(500),
1441 ..Default::default()
1442 };
1443 let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1444 let poison_manager = Arc::new(InMemoryPoisonStore::new());
1445 let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1446 manager.manager_ctx.start();
1447
1448 #[derive(Debug)]
1449 struct MockProcedure {
1450 panic: bool,
1451 }
1452
1453 #[async_trait]
1454 impl Procedure for MockProcedure {
1455 fn type_name(&self) -> &str {
1456 "MockProcedure"
1457 }
1458
1459 async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
1460 if self.panic {
1461 panic!();
1464 } else {
1465 Err(Error::external(MockError::new(StatusCode::Unexpected)))
1466 }
1467 }
1468
1469 async fn rollback(&mut self, _: &Context) -> Result<()> {
1470 Ok(())
1471 }
1472
1473 fn rollback_supported(&self) -> bool {
1474 true
1475 }
1476
1477 fn dump(&self) -> Result<String> {
1478 Ok(String::new())
1479 }
1480
1481 fn lock_key(&self) -> LockKey {
1482 LockKey::single_exclusive("test.submit")
1483 }
1484
1485 fn poison_keys(&self) -> PoisonKeys {
1486 PoisonKeys::default()
1487 }
1488 }
1489
1490 let check_procedure = |procedure| async {
1491 let procedure_id = ProcedureId::random();
1492 manager
1493 .submit(ProcedureWithId {
1494 id: procedure_id,
1495 procedure: Box::new(procedure),
1496 })
1497 .await
1498 .unwrap()
1499 };
1500
1501 let mut watcher = check_procedure(MockProcedure { panic: false }).await;
1502 watcher.changed().await.unwrap();
1504 assert!(watcher.borrow().is_prepare_rollback());
1505 watcher.changed().await.unwrap();
1506 assert!(watcher.borrow().is_rolling_back());
1507 watcher.changed().await.unwrap();
1508 assert!(watcher.borrow().is_failed());
1509 let mut watcher = check_procedure(MockProcedure { panic: true }).await;
1511 watcher.changed().await.unwrap();
1512 assert!(watcher.borrow().is_failed());
1513 }
1514
1515 #[tokio::test]
1516 async fn test_procedure_manager_stopped() {
1517 let dir = create_temp_dir("procedure_manager_stopped");
1518 let config = ManagerConfig {
1519 parent_path: "data/".to_string(),
1520 max_retry_times: 3,
1521 retry_delay: Duration::from_millis(500),
1522 ..Default::default()
1523 };
1524 let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1525 let poison_manager = Arc::new(InMemoryPoisonStore::new());
1526 let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1527
1528 let mut procedure = ProcedureToLoad::new("submit");
1529 procedure.lock_key = LockKey::single_exclusive("test.submit");
1530 let procedure_id = ProcedureId::random();
1531 assert_matches!(
1532 manager
1533 .submit(ProcedureWithId {
1534 id: procedure_id,
1535 procedure: Box::new(procedure),
1536 })
1537 .await
1538 .unwrap_err(),
1539 error::Error::ManagerNotStart { .. }
1540 );
1541 }
1542
1543 #[tokio::test]
1544 async fn test_procedure_manager_restart() {
1545 let dir = create_temp_dir("procedure_manager_restart");
1546 let config = ManagerConfig {
1547 parent_path: "data/".to_string(),
1548 max_retry_times: 3,
1549 retry_delay: Duration::from_millis(500),
1550 ..Default::default()
1551 };
1552 let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1553 let poison_manager = Arc::new(InMemoryPoisonStore::new());
1554 let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1555
1556 manager.start().await.unwrap();
1557 manager.stop().await.unwrap();
1558 manager.start().await.unwrap();
1559
1560 let mut procedure = ProcedureToLoad::new("submit");
1561 procedure.lock_key = LockKey::single_exclusive("test.submit");
1562 let procedure_id = ProcedureId::random();
1563 assert!(
1564 manager
1565 .submit(ProcedureWithId {
1566 id: procedure_id,
1567 procedure: Box::new(procedure),
1568 })
1569 .await
1570 .is_ok()
1571 );
1572 assert!(
1573 manager
1574 .procedure_state(procedure_id)
1575 .await
1576 .unwrap()
1577 .is_some()
1578 );
1579 }
1580
1581 #[tokio::test(flavor = "multi_thread")]
1582 async fn test_remove_outdated_meta_task() {
1583 let dir = create_temp_dir("remove_outdated_meta_task");
1584 let object_store = test_util::new_object_store(&dir);
1585 let config = ManagerConfig {
1586 parent_path: "data/".to_string(),
1587 max_retry_times: 3,
1588 retry_delay: Duration::from_millis(500),
1589 remove_outdated_meta_task_interval: Duration::from_millis(1),
1590 remove_outdated_meta_ttl: Duration::from_millis(1),
1591 max_running_procedures: 128,
1592 };
1593 let state_store = Arc::new(ObjectStateStore::new(object_store.clone()));
1594 let poison_manager = Arc::new(InMemoryPoisonStore::new());
1595 let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1596 manager.manager_ctx.set_running();
1597
1598 let mut procedure = ProcedureToLoad::new("submit");
1599 procedure.lock_key = LockKey::single_exclusive("test.submit");
1600 let procedure_id = ProcedureId::random();
1601 assert!(
1602 manager
1603 .submit(ProcedureWithId {
1604 id: procedure_id,
1605 procedure: Box::new(procedure),
1606 })
1607 .await
1608 .is_ok()
1609 );
1610 let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1611 watcher.changed().await.unwrap();
1612
1613 manager.start().await.unwrap();
1614 tokio::time::sleep(Duration::from_millis(300)).await;
1615 assert!(
1616 manager
1617 .procedure_state(procedure_id)
1618 .await
1619 .unwrap()
1620 .is_none()
1621 );
1622
1623 manager.stop().await.unwrap();
1625 let mut procedure = ProcedureToLoad::new("submit");
1626 procedure.lock_key = LockKey::single_exclusive("test.submit");
1627 let procedure_id = ProcedureId::random();
1628
1629 manager.manager_ctx.set_running();
1630 assert!(
1631 manager
1632 .submit(ProcedureWithId {
1633 id: procedure_id,
1634 procedure: Box::new(procedure),
1635 })
1636 .await
1637 .is_ok()
1638 );
1639 let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1640 watcher.changed().await.unwrap();
1641 tokio::time::sleep(Duration::from_millis(300)).await;
1642 assert!(
1643 manager
1644 .procedure_state(procedure_id)
1645 .await
1646 .unwrap()
1647 .is_some()
1648 );
1649
1650 let mut procedure = ProcedureToLoad::new("submit");
1652 procedure.lock_key = LockKey::single_exclusive("test.submit");
1653 let procedure_id = ProcedureId::random();
1654 assert!(
1655 manager
1656 .submit(ProcedureWithId {
1657 id: procedure_id,
1658 procedure: Box::new(procedure),
1659 })
1660 .await
1661 .is_ok()
1662 );
1663 let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1664 watcher.changed().await.unwrap();
1665
1666 manager.start().await.unwrap();
1667 tokio::time::sleep(Duration::from_millis(300)).await;
1668 assert!(
1669 manager
1670 .procedure_state(procedure_id)
1671 .await
1672 .unwrap()
1673 .is_none()
1674 );
1675 }
1676
1677 #[tokio::test]
1678 async fn test_too_many_running_procedures() {
1679 let dir = create_temp_dir("too_many_running_procedures");
1680 let config = ManagerConfig {
1681 parent_path: "data/".to_string(),
1682 max_retry_times: 3,
1683 retry_delay: Duration::from_millis(500),
1684 max_running_procedures: 1,
1685 ..Default::default()
1686 };
1687 let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
1688 let poison_manager = Arc::new(InMemoryPoisonStore::new());
1689 let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1690 manager.start().await.unwrap();
1691
1692 manager
1693 .manager_ctx
1694 .running_procedures
1695 .lock()
1696 .unwrap()
1697 .insert(ProcedureId::random());
1698
1699 let mut procedure = ProcedureToLoad::new("submit");
1701 procedure.lock_key = LockKey::single_exclusive("test.submit");
1702 let procedure_id = ProcedureId::random();
1703 let err = manager
1704 .submit(ProcedureWithId {
1705 id: procedure_id,
1706 procedure: Box::new(procedure),
1707 })
1708 .await
1709 .unwrap_err();
1710 assert!(matches!(err, Error::TooManyRunningProcedures { .. }));
1711
1712 manager
1713 .manager_ctx
1714 .running_procedures
1715 .lock()
1716 .unwrap()
1717 .clear();
1718
1719 let mut procedure = ProcedureToLoad::new("submit");
1721 procedure.lock_key = LockKey::single_exclusive("test.submit");
1722 assert!(
1723 manager
1724 .submit(ProcedureWithId {
1725 id: procedure_id,
1726 procedure: Box::new(procedure),
1727 })
1728 .await
1729 .is_ok()
1730 );
1731 assert!(
1732 manager
1733 .procedure_state(procedure_id)
1734 .await
1735 .unwrap()
1736 .is_some()
1737 );
1738 let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
1740 watcher.changed().await.unwrap();
1741 assert!(watcher.borrow().is_done());
1742 }
1743
1744 #[derive(Debug)]
1745 struct ProcedureToRecover {
1746 content: String,
1747 lock_key: LockKey,
1748 notify: Option<Arc<Notify>>,
1749 poison_keys: PoisonKeys,
1750 }
1751
1752 #[async_trait]
1753 impl Procedure for ProcedureToRecover {
1754 fn type_name(&self) -> &str {
1755 "ProcedureToRecover"
1756 }
1757
1758 async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
1759 Ok(Status::done())
1760 }
1761
1762 fn dump(&self) -> Result<String> {
1763 Ok(self.content.clone())
1764 }
1765
1766 fn lock_key(&self) -> LockKey {
1767 self.lock_key.clone()
1768 }
1769
1770 fn recover(&mut self) -> Result<()> {
1771 self.notify.as_ref().unwrap().notify_one();
1772 Ok(())
1773 }
1774
1775 fn poison_keys(&self) -> PoisonKeys {
1776 self.poison_keys.clone()
1777 }
1778 }
1779
1780 impl ProcedureToRecover {
1781 fn new(content: &str) -> ProcedureToRecover {
1782 ProcedureToRecover {
1783 content: content.to_string(),
1784 lock_key: LockKey::default(),
1785 poison_keys: PoisonKeys::default(),
1786 notify: None,
1787 }
1788 }
1789
1790 fn loader(notify: Arc<Notify>) -> BoxedProcedureLoader {
1791 let f = move |json: &str| {
1792 let procedure = ProcedureToRecover {
1793 content: json.to_string(),
1794 lock_key: LockKey::default(),
1795 poison_keys: PoisonKeys::default(),
1796 notify: Some(notify.clone()),
1797 };
1798 Ok(Box::new(procedure) as _)
1799 };
1800 Box::new(f)
1801 }
1802 }
1803
1804 #[tokio::test]
1805 async fn test_procedure_recover() {
1806 common_telemetry::init_default_ut_logging();
1807 let dir = create_temp_dir("procedure_recover");
1808 let object_store = test_util::new_object_store(&dir);
1809 let config = ManagerConfig {
1810 parent_path: "data/".to_string(),
1811 max_retry_times: 3,
1812 retry_delay: Duration::from_millis(500),
1813 ..Default::default()
1814 };
1815 let state_store = Arc::new(ObjectStateStore::new(object_store.clone()));
1816 let poison_manager = Arc::new(InMemoryPoisonStore::new());
1817 let manager = LocalManager::new(config, state_store, poison_manager, None, None);
1818 manager.manager_ctx.start();
1819
1820 let notify = Arc::new(Notify::new());
1821 manager
1822 .register_loader(
1823 "ProcedureToRecover",
1824 ProcedureToRecover::loader(notify.clone()),
1825 )
1826 .unwrap();
1827
1828 let procedure_store = ProcedureStore::from_object_store(object_store.clone());
1830 let root: BoxedProcedure = Box::new(ProcedureToRecover::new("test procedure recovery"));
1831 let root_id = ProcedureId::random();
1832 for step in 0..3 {
1834 let type_name = root.type_name().to_string();
1835 let data = root.dump().unwrap();
1836 procedure_store
1837 .store_procedure(root_id, step, type_name, data, None)
1838 .await
1839 .unwrap();
1840 }
1841
1842 manager.recover().await.unwrap();
1844 timeout(Duration::from_secs(10), notify.notified())
1845 .await
1846 .unwrap();
1847 }
1848}