implement failpoint lib replacement

This commit is contained in:
Christian Schwarz
2025-07-11 11:59:01 +02:00
parent cec0543b51
commit f29b9737cc
11 changed files with 758 additions and 141 deletions

View File

@@ -1,9 +1,12 @@
//! Failpoint support code shared between pageserver and safekeepers.
//!
//! This module provides a compatibility layer over the new neon_failpoint crate.
use tokio_util::sync::CancellationToken;
pub use neon_failpoint::{configure_failpoint as apply_failpoint, init, has_failpoints};
/// Declare a failpoint that can use to `pause` failpoint action.
/// We don't want to block the executor thread, hence, spawn_blocking + await.
/// This is now a compatibility wrapper around the new neon_failpoint crate.
///
/// Optionally pass a cancellation token, and this failpoint will drop out of
/// its pause when the cancellation token fires. This is useful for testing
@@ -11,9 +14,6 @@ use tokio_util::sync::CancellationToken;
/// The macro evaluates to a Result in that case, where Ok(()) is the case
/// where the failpoint was not paused, and Err() is the case where cancellation
/// token fired while evaluating the failpoint.
///
/// Remember to unpause the failpoint in the test; until that happens, one of the
/// limited number of spawn_blocking thread pool threads is leaked.
#[macro_export]
macro_rules! pausable_failpoint {
($name:literal) => {{
@@ -24,26 +24,10 @@ macro_rules! pausable_failpoint {
}};
($name:literal, $cancel:expr) => {{
if cfg!(feature = "testing") {
let failpoint_fut = ::tokio::task::spawn_blocking({
let current = ::tracing::Span::current();
move || {
let _entered = current.entered();
::tracing::info!("at failpoint {}", $name);
::fail::fail_point!($name);
}
});
let cancel_fut = async move {
$cancel.cancelled().await;
};
::tokio::select! {
res = failpoint_fut => {
res.expect("spawn_blocking");
// continue with execution
Ok(())
},
_ = cancel_fut => {
Err(())
}
match ::neon_failpoint::failpoint_with_cancellation($name, None, $cancel).await {
::neon_failpoint::FailpointResult::Continue => Ok(()),
::neon_failpoint::FailpointResult::Return(_) => Ok(()),
::neon_failpoint::FailpointResult::Cancelled => Err(()),
}
} else {
Ok(())
@@ -53,7 +37,7 @@ macro_rules! pausable_failpoint {
pub use pausable_failpoint;
/// use with fail::cfg("$name", "return(2000)")
/// use with neon_failpoint::configure_failpoint("$name", "sleep(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
/// specified time (in milliseconds). The main difference is that we use async
@@ -66,120 +50,32 @@ pub use pausable_failpoint;
#[macro_export]
macro_rules! __failpoint_sleep_millis_async {
($name:literal) => {{
// If the failpoint is used with a "return" action, set should_sleep to the
// returned value (as string). Otherwise it's set to None.
let should_sleep = (|| {
::fail::fail_point!($name, |x| x);
::std::option::Option::None
})();
// Sleep if the action was a returned value
if let ::std::option::Option::Some(duration_str) = should_sleep {
$crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
if cfg!(feature = "testing") {
::neon_failpoint::failpoint($name, None).await;
}
}};
($name:literal, $cancel:expr) => {{
// If the failpoint is used with a "return" action, set should_sleep to the
// returned value (as string). Otherwise it's set to None.
let should_sleep = (|| {
::fail::fail_point!($name, |x| x);
::std::option::Option::None
})();
// Sleep if the action was a returned value
if let ::std::option::Option::Some(duration_str) = should_sleep {
$crate::failpoint_support::failpoint_sleep_cancellable_helper(
$name,
duration_str,
$cancel,
)
.await
if cfg!(feature = "testing") {
::neon_failpoint::failpoint_with_cancellation($name, None, $cancel).await;
}
}};
}
pub use __failpoint_sleep_millis_async as sleep_millis_async;
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
// Helper functions are no longer needed as the new implementation handles this internally
// but we keep them for backward compatibility
#[doc(hidden)]
pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
let millis = duration_str.parse::<u64>().unwrap();
let d = std::time::Duration::from_millis(millis);
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
tokio::time::sleep(d).await;
tracing::info!("failpoint {:?}: sleep done", name);
pub async fn failpoint_sleep_helper(_name: &'static str, _duration_str: String) {
// This is now handled by the neon_failpoint crate internally
tracing::warn!("failpoint_sleep_helper is deprecated, use neon_failpoint directly");
}
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
#[doc(hidden)]
pub async fn failpoint_sleep_cancellable_helper(
name: &'static str,
duration_str: String,
cancel: &CancellationToken,
_name: &'static str,
_duration_str: String,
_cancel: &CancellationToken,
) {
let millis = duration_str.parse::<u64>().unwrap();
let d = std::time::Duration::from_millis(millis);
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
tokio::time::timeout(d, cancel.cancelled()).await.ok();
tracing::info!("failpoint {:?}: sleep done", name);
}
/// Initialize the configured failpoints
///
/// You must call this function before any concurrent threads do operations.
pub fn init() -> fail::FailScenario<'static> {
// The failpoints lib provides support for parsing the `FAILPOINTS` env var.
// We want non-default behavior for `exit`, though, so, we handle it separately.
//
// Format for FAILPOINTS is "name=actions" separated by ";".
let actions = std::env::var("FAILPOINTS");
if actions.is_ok() {
// SAFETY: this function should before any threads start and access env vars concurrently
unsafe {
std::env::remove_var("FAILPOINTS");
}
} else {
// let the library handle non-utf8, or nothing for not present
}
let scenario = fail::FailScenario::setup();
if let Ok(val) = actions {
val.split(';')
.enumerate()
.map(|(i, s)| s.split_once('=').ok_or((i, s)))
.for_each(|res| {
let (name, actions) = match res {
Ok(t) => t,
Err((i, s)) => {
panic!(
"startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
i + 1,
);
}
};
if let Err(e) = apply_failpoint(name, actions) {
panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
}
});
}
scenario
}
pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
if actions == "exit" {
fail::cfg_callback(name, exit_failpoint)
} else {
fail::cfg(name, actions)
}
}
#[inline(never)]
fn exit_failpoint() {
tracing::info!("Exit requested by failpoint");
std::process::exit(1);
// This is now handled by the neon_failpoint crate internally
tracing::warn!("failpoint_sleep_cancellable_helper is deprecated, use neon_failpoint directly");
}