From b52389f2280e082134643edf75f250ee19002c92 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 Mar 2023 22:29:52 +0400 Subject: [PATCH] Cleanly exit on any shutdown signal in storage_broker. neon_local sends SIGQUIT, which otherwise dumps core by default. Also, remove obsolete install_shutdown_handlers; in all binaries it was overridden by ShutdownSignals::handle later. ref https://github.com/neondatabase/neon/issues/3847 --- libs/utils/src/signals.rs | 23 +---------------------- pageserver/src/bin/pageserver.rs | 11 +++-------- safekeeper/src/bin/safekeeper.rs | 17 +++++++---------- storage_broker/src/bin/storage_broker.rs | 9 +++++++++ 4 files changed, 20 insertions(+), 40 deletions(-) diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index 6586da2339..c37e9aea58 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,25 +1,7 @@ -use signal_hook::flag; use signal_hook::iterator::Signals; -use std::sync::atomic::AtomicBool; -use std::sync::Arc; pub use signal_hook::consts::{signal::*, TERM_SIGNALS}; -pub fn install_shutdown_handlers() -> anyhow::Result { - let term_now = Arc::new(AtomicBool::new(false)); - for sig in TERM_SIGNALS { - // When terminated by a second term signal, exit with exit code 1. - // This will do nothing the first time (because term_now is false). - flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?; - // But this will "arm" the above for the second time, by setting it to true. - // The order of registering these is important, if you put this one first, it will - // first arm and then terminate ‒ all in the first round. - flag::register(*sig, Arc::clone(&term_now))?; - } - - Ok(ShutdownSignals) -} - pub enum Signal { Quit, Interrupt, @@ -39,10 +21,7 @@ impl Signal { pub struct ShutdownSignals; impl ShutdownSignals { - pub fn handle( - self, - mut handler: impl FnMut(Signal) -> anyhow::Result<()>, - ) -> anyhow::Result<()> { + pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> { for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() { let signal = match raw_signal { SIGINT => Signal::Interrupt, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 14e86ddcb6..cbfd3e1165 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -24,11 +24,9 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::signals::ShutdownSignals; use utils::{ - auth::JwtAuth, - logging, project_git_version, - sentry_init::init_sentry, - signals::{self, Signal}, + auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal, tcp_listener, }; @@ -263,9 +261,6 @@ fn start_pageserver( info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; - // Install signal handlers - let signals = signals::install_shutdown_handlers()?; - // Launch broker client WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; @@ -409,7 +404,7 @@ fn start_pageserver( } // All started up! Now just sit and wait for shutdown signal. - signals.handle(|signal| match signal { + ShutdownSignals::handle(|signal| match signal { Signal::Quit => { info!( "Got {}. Terminating in immediate shutdown mode", diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 8966e8c49b..ace921a26d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -5,6 +5,7 @@ use anyhow::{bail, Context, Result}; use clap::Parser; use remote_storage::RemoteStorageConfig; use toml_edit::Document; +use utils::signals::ShutdownSignals; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; @@ -39,7 +40,7 @@ use utils::{ logging::{self, LogFormat}, project_git_version, sentry_init::init_sentry, - signals, tcp_listener, + tcp_listener, }; const PID_FILE_NAME: &str = "safekeeper.pid"; @@ -216,7 +217,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); @@ -274,15 +274,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { set_build_info_metric(GIT_VERSION); // TODO: put more thoughts into handling of failed threads - // We probably should restart them. + // We should catch & die if they are in trouble. - // NOTE: we still have to handle signals like SIGQUIT to prevent coredumps - signals.handle(|signal| { - // TODO: implement graceful shutdown with joining threads etc - info!( - "received {}, terminating in immediate shutdown mode", - signal.name() - ); + // On any shutdown signal, log receival and exit. Additionally, handling + // SIGQUIT prevents coredump. + ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); std::process::exit(0); }) } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1a0d261184..57f975b0df 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -33,6 +33,7 @@ use tonic::transport::server::Connected; use tonic::Code; use tonic::{Request, Response, Status}; use tracing::*; +use utils::signals::ShutdownSignals; use metrics::{Encoder, TextEncoder}; use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; @@ -437,6 +438,14 @@ async fn main() -> Result<(), Box> { info!("version: {GIT_VERSION}"); ::metrics::set_build_info_metric(GIT_VERSION); + // On any shutdown signal, log receival and exit. + std::thread::spawn(move || { + ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); + std::process::exit(0); + }) + }); + let registry = Registry { shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))), timeline_chan_size: args.timeline_chan_size,