diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 9e64eafffc..2c47d6cfe0 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,6 +11,8 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; +use std::sync::Arc; + pub use pageserver_api::keyspace; pub mod aux_file; pub mod metrics; @@ -58,6 +60,7 @@ pub use crate::metrics::preinitialize_metrics; pub async fn shutdown_pageserver( tenant_manager: &TenantManager, mut deletion_queue: DeletionQueue, + walredo_global_state: Arc, exit_code: i32, ) { use std::time::Duration; @@ -88,6 +91,16 @@ pub async fn shutdown_pageserver( ) .await; + // The caller of this function already cancelled the `shutdown_pageserver` cancellation token, + // to which all the per-tenant walredo _manager_ methods are sensitive. + // This here is just to make sure the underlying walredo _processes_ are gone. + timed( + walredo_global_state.wait_shutdown_complete().await, + "wait for walredo processes to exit", + Duration::from_secs(1), + ) + .await; + // Best effort to persist any outstanding deletions, to avoid leaking objects deletion_queue.shutdown(Duration::from_secs(5)).await; @@ -104,10 +117,11 @@ pub async fn shutdown_pageserver( // There should be nothing left, but let's be sure timed( task_mgr::shutdown_tasks(None, None, None), - "shutdown leftovers", + "shutdown taskmgr leftovers", Duration::from_secs(1), ) .await; + info!("Shut down successfully completed"); std::process::exit(exit_code); } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 1cb1623ac2..d7b712bb8e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -87,6 +87,16 @@ impl GlobalState { pub(self) fn is_shutdown_requested(self: &Arc) -> bool { self.shutdown_bool.load(Ordering::Relaxed) } + pub(crate) fn wait_shutdown_complete(self: &Arc) { + assert!( + self.shutdown.is_cancelled(), + "must cancel the `shutdown` token before waiting, otherwise we will wait forever" + ); + self.spawn_gate.close().await + // The destructor of WalRedoProcess SIGKILLs and `wait()`s for the process + // The gate guard is stored in WalRedoProcess. + // So, we arrive here once all WalRedoProcess structs are gone. + } } /// @@ -310,11 +320,9 @@ impl PostgresRedoManager { .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) .await .context("apply_wal_records"); - if result.is_err() { - // avoid - if self.global_state.shutdown.is_cancelled() { - return Err(Error::Cancelled); - } + if matches!(result, Err(Error::Cancelled)) { + // bail asap and also avoid log noise due to the error reporting below + return Err(Error::Cancelled); } let duration = started_at.elapsed();