diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 190e38e341..c6cb460f8f 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -183,13 +183,8 @@ fn main() -> anyhow::Result<()> { // as a ref. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // If failpoints are used, terminate the whole pageserver process if they are hit. + // Set up failpoints support let scenario = FailScenario::setup(); - if fail::has_failpoints() { - std::panic::set_hook(Box::new(|_| { - std::process::exit(1); - })); - } // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 28d6bf2621..03264c9782 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -730,7 +730,18 @@ impl postgres_backend::Handler for PageServerHandler { for failpoint in failpoints.split(';') { if let Some((name, actions)) = failpoint.split_once('=') { info!("cfg failpoint: {} {}", name, actions); - fail::cfg(name, actions).unwrap(); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + if actions == "exit" { + fail::cfg_callback(name, || { + info!("Exit requested by failpoint"); + std::process::exit(1); + }) + .unwrap(); + } else { + fail::cfg(name, actions).unwrap(); + } } else { bail!("Invalid failpoints format"); } diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py index dbfa943a7a..eb1747efa5 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/batch_others/test_recovery.py @@ -45,14 +45,14 @@ def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder): # Configure failpoints pscur.execute( - "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=panic") + "failpoints 
checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit") # Do some updates until pageserver is crashed try: while True: cur.execute("update foo set x=x+1") except Exception as err: - log.info(f"Excepted server crash {err}") + log.info(f"Expected server crash {err}") log.info("Wait before server restart") env.pageserver.stop()