Don't swallow panics when the pageserver is built with failpoints.

It's very confusing, and because you don't get a stack trace and an error
message in the logs, it makes debugging very hard. However, the
'test_pageserver_recovery' test relied on that behavior. To support that,
add a new "exit" action to the pageserver 'failpoints' command, so that
you can explicitly request to exit the process when a failpoint is hit.
This commit is contained in:
Heikki Linnakangas
2022-05-16 09:58:58 +03:00
parent a10cac980f
commit 51ea9c3053
3 changed files with 15 additions and 9 deletions

View File

@@ -183,13 +183,8 @@ fn main() -> anyhow::Result<()> {
// as a ref.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
// If failpoints are used, terminate the whole pageserver process if they are hit.
// Initialize failpoints support
let scenario = FailScenario::setup();
if fail::has_failpoints() {
std::panic::set_hook(Box::new(|_| {
std::process::exit(1);
}));
}
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors);

View File

@@ -730,7 +730,18 @@ impl postgres_backend::Handler for PageServerHandler {
for failpoint in failpoints.split(';') {
if let Some((name, actions)) = failpoint.split_once('=') {
info!("cfg failpoint: {} {}", name, actions);
fail::cfg(name, actions).unwrap();
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
if actions == "exit" {
fail::cfg_callback(name, || {
info!("Exit requested by failpoint");
std::process::exit(1);
})
.unwrap();
} else {
fail::cfg(name, actions).unwrap();
}
} else {
bail!("Invalid failpoints format");
}

View File

@@ -45,14 +45,14 @@ def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder):
# Configure failpoints
pscur.execute(
"failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=panic")
"failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit")
# Do some updates until pageserver is crashed
try:
while True:
cur.execute("update foo set x=x+1")
except Exception as err:
log.info(f"Excepted server crash {err}")
log.info(f"Expected server crash {err}")
log.info("Wait before server restart")
env.pageserver.stop()