From f312c6571f45395f4a5adfb2b0450741c16ebd58 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 19:47:17 +0100 Subject: [PATCH] pageserver: respond to multiple shutdown signals (#9982) ## Problem The Pageserver signal handler would only respond to a single signal and initiate shutdown. Subsequent signals were ignored. This meant that a `SIGQUIT` sent after a `SIGTERM` had no effect (e.g. in the case of a slow or stalled shutdown). The `test_runner` uses this to force shutdown if graceful shutdown is slow. Touches #9740. ## Summary of changes Keep responding to signals after the initial shutdown signal has been received. Arguably, the `test_runner` should also use `SIGKILL` rather than `SIGQUIT` in this case, but it seems reasonable to respond to `SIGQUIT` regardless. --- pageserver/src/bin/pageserver.rs | 76 +++++++++++++++++++------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 8fe225c6aa..567a69da3b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -636,45 +636,59 @@ fn start_pageserver( tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? }); - let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); - // All started up! Now just sit and wait for shutdown signal. + BACKGROUND_RUNTIME.block_on(async move { + let signal_token = CancellationToken::new(); + let signal_cancel = signal_token.child_token(); - { - BACKGROUND_RUNTIME.block_on(async move { + // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals + // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See: + // https://github.com/neondatabase/neon/issues/9740. + tokio::spawn(async move { let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); - let signal = tokio::select! { - _ = sigquit.recv() => { - info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); - std::process::exit(111); + + loop { + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); + std::process::exit(111); + } + _ = sigint.recv() => "SIGINT", + _ = sigterm.recv() => "SIGTERM", + }; + + if !signal_token.is_cancelled() { + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode."); + signal_token.cancel(); + } else { + info!("Got signal {signal}. Already shutting down."); } - _ = sigint.recv() => { "SIGINT" }, - _ = sigterm.recv() => { "SIGTERM" }, - }; + } + }); - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + // Wait for cancellation signal and shut down the pageserver. + // + // This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't + // reach very far, and `task_mgr` is used instead. The plan is to change that over time. + signal_cancel.cancelled().await; - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - pageserver::shutdown_pageserver( - http_endpoint_listener, - page_service, - consumption_metrics_tasks, - disk_usage_eviction_task, - &tenant_manager, - background_purges, - deletion_queue.clone(), - secondary_controller_tasks, - 0, - ) - .await; - unreachable!() - }) - } + shutdown_pageserver.cancel(); + pageserver::shutdown_pageserver( + http_endpoint_listener, + page_service, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; + unreachable!(); + }) } async fn create_remote_storage_client(