From ce0d0a204ce9f77d6f7fe23b2bd2f393a75c7b6b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 12 Aug 2024 19:15:48 +0200 Subject: [PATCH] fix(walredo): shutdown can complete too early (#8701) Problem ------- The following race is possible today: ``` walredo_extraordinary_shutdown_thread: shutdown gets until Poll::Pending of self.launched_processes.close().await call other thread: drops the last Arc = 1. drop(_launched_processes_guard) runs, this ... walredo_extraordinary_shutdown_thread: ... wakes self.launched_processes.close().await walredo_extraordinary_shutdown_thread: logs `done` other thread: = 2. drop(process): this kill & waits ``` Solution -------- Change drop order so that `process` gets dropped first. Context ------- https://neondb.slack.com/archives/C06Q661FA4C/p1723478188785719?thread_ts=1723456706.465789&cid=C06Q661FA4C refs https://github.com/neondatabase/neon/pull/8572 refs https://github.com/neondatabase/cloud/issues/11387 --- pageserver/src/walredo.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 770081b3b4..82585f9ed8 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -107,8 +107,10 @@ enum ProcessOnceCell { } struct Process { - _launched_processes_guard: utils::sync::gate::GateGuard, process: process::WalRedoProcess, + /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`]. + /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit). + _launched_processes_guard: utils::sync::gate::GateGuard, } impl std::ops::Deref for Process { @@ -327,20 +329,23 @@ impl PostgresRedoManager { }, Err(permit) => { let start = Instant::now(); - let proc = Arc::new(Process { - _launched_processes_guard: match self.launched_processes.enter() { + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed. + let _launched_processes_guard = match self.launched_processes.enter() { Ok(guard) => guard, Err(GateError::GateClosed) => unreachable!( "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" ), - }, - process: process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - }); + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); let duration = start.elapsed(); WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); info!(