mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-09 14:32:57 +00:00
Don't swallow panics when the pageserver is built with failpoints.
Swallowing panics is very confusing, and because you don't get a stack trace or error message in the logs, it makes debugging very hard. However, the 'test_pageserver_recovery' test relied on that behavior. To support that, add a new "exit" action to the pageserver 'failpoints' command, so that you can explicitly request to exit the process when a failpoint is hit.
This commit is contained in:
@@ -183,13 +183,8 @@ fn main() -> anyhow::Result<()> {
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// If failpoints are used, terminate the whole pageserver process if they are hit.
|
||||
// Initialize failpoints support
|
||||
let scenario = FailScenario::setup();
|
||||
if fail::has_failpoints() {
|
||||
std::panic::set_hook(Box::new(|_| {
|
||||
std::process::exit(1);
|
||||
}));
|
||||
}
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
|
||||
@@ -730,7 +730,18 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
for failpoint in failpoints.split(';') {
|
||||
if let Some((name, actions)) = failpoint.split_once('=') {
|
||||
info!("cfg failpoint: {} {}", name, actions);
|
||||
fail::cfg(name, actions).unwrap();
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
if actions == "exit" {
|
||||
fail::cfg_callback(name, || {
|
||||
info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
})
|
||||
.unwrap();
|
||||
} else {
|
||||
fail::cfg(name, actions).unwrap();
|
||||
}
|
||||
} else {
|
||||
bail!("Invalid failpoints format");
|
||||
}
|
||||
|
||||
@@ -45,14 +45,14 @@ def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder):
|
||||
|
||||
# Configure failpoints
|
||||
pscur.execute(
|
||||
"failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=panic")
|
||||
"failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit")
|
||||
|
||||
# Do some updates until pageserver is crashed
|
||||
try:
|
||||
while True:
|
||||
cur.execute("update foo set x=x+1")
|
||||
except Exception as err:
|
||||
log.info(f"Excepted server crash {err}")
|
||||
log.info(f"Expected server crash {err}")
|
||||
|
||||
log.info("Wait before server restart")
|
||||
env.pageserver.stop()
|
||||
|
||||
Reference in New Issue
Block a user