diff --git a/Cargo.lock b/Cargo.lock index 8e0ad7c8ee..73cb83d3a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5880,6 +5880,7 @@ dependencies = [ "chrono", "const_format", "criterion", + "fail", "futures", "heapless", "hex", diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index be41b610b8..dea925b468 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -557,19 +557,6 @@ pub enum DownloadRemoteLayersTaskState { ShutDown, } -pub type ConfigureFailpointsRequest = Vec; - -/// Information for configuring a single fail point -#[derive(Debug, Serialize, Deserialize)] -pub struct FailpointConfig { - /// Name of the fail point - pub name: String, - /// List of actions to take, using the format described in `fail::cfg` - /// - /// We also support `actions = "exit"` to cause the fail point to immediately exit. - pub actions: String, -} - #[derive(Debug, Serialize, Deserialize)] pub struct TimelineGcRequest { pub gc_horizon: Option, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index af0414daa2..706b7a3187 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] + [dependencies] arc-swap.workspace = true sentry.workspace = true @@ -16,6 +22,7 @@ chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } hyper = { workspace = true, features = ["full"] } +fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true nix.workspace = true diff --git a/pageserver/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs similarity index 61% rename from pageserver/src/failpoint_support.rs rename to libs/utils/src/failpoint_support.rs index 2190eba18a..5ec532e2a6 100644 --- a/pageserver/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -1,3 +1,14 @@ +//! Failpoint support code shared between pageserver and safekeepers. + +use crate::http::{ + error::ApiError, + json::{json_request, json_response}, +}; +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::*; + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the @@ -25,7 +36,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async; // Helper function used by the macro. (A function has nicer scoping so we // don't need to decorate everything with "::") #[doc(hidden)] -pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { +pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { let millis = duration_str.parse::().unwrap(); let d = std::time::Duration::from_millis(millis); @@ -71,7 +82,7 @@ pub fn init() -> fail::FailScenario<'static> { scenario } -pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { +pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { if actions == "exit" { fail::cfg_callback(name, exit_failpoint) } else { @@ -84,3 +95,45 @@ fn exit_failpoint() { tracing::info!("Exit requested by failpoint"); std::process::exit(1); } + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. +pub async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because storage was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index bb6c848bf4..9e9b0adfe5 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -83,6 +83,8 @@ pub mod timeout; pub mod sync; +pub mod failpoint_support; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f65c4f4580..621ad050f4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -31,6 +31,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ @@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> { } // Initialize up failpoints support - let scenario = pageserver::failpoint_support::init(); + let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 11a3a2c872..157e6b4e3e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -25,6 +25,7 @@ use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; +use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -66,9 +67,6 @@ use utils::{ lsn::Lsn, }; -// Imports only used for testing APIs -use pageserver_api::models::ConfigureFailpointsRequest; - // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. @@ -1293,34 +1291,6 @@ async fn handle_tenant_break( json_response(StatusCode::OK, ()) } -async fn failpoints_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - if !fail::has_failpoints() { - return Err(ApiError::BadRequest(anyhow!( - "Cannot manage failpoints because pageserver was compiled without failpoints support" - ))); - } - - let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; - for fp in failpoints { - info!("cfg failpoint: {} {}", fp.name, fp.actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions); - - if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(anyhow!( - "Failed to configure failpoints: {err_msg}" - ))); - } - } - - json_response(StatusCode::OK, ()) -} - // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 58adf6e8c4..c1ce0af47b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -25,8 +25,6 @@ pub mod walingest; pub mod walrecord; pub mod walredo; -pub mod failpoint_support; - use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2f2169d194..e50987c84b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -33,6 +33,7 @@ use tracing::*; use utils::backoff; use utils::completion; use utils::crashsafe::path_with_suffix_extension; +use utils::failpoint_support; use utils::fs_ext; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; @@ -890,7 +891,7 @@ impl Tenant { ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); - crate::failpoint_support::sleep_millis_async!("before-attaching-tenant"); + failpoint_support::sleep_millis_async!("before-attaching-tenant"); let preload = match preload { Some(p) => p, @@ -1002,7 +1003,7 @@ impl Tenant { // IndexPart is the source of truth. self.clean_up_timelines(&existent_timelines)?; - crate::failpoint_support::sleep_millis_async!("attach-before-activate"); + failpoint_support::sleep_millis_async!("attach-before-activate"); info!("Done"); @@ -2839,9 +2840,7 @@ impl Tenant { } }; - crate::failpoint_support::sleep_millis_async!( - "gc_iteration_internal_after_getting_gc_timelines" - ); + failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); // If there is nothing to GC, we don't want any messages in the INFO log. if !gc_timelines.is_empty() { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1d14214030..a6a8972970 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use utils::failpoint_support; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; @@ -344,9 +345,7 @@ impl<'a> WalIngest<'a> { // particular point in the WAL. For more fine-grained control, // we could peek into the message and only pause if it contains // a particular string, for example, but this is enough for now. - crate::failpoint_support::sleep_millis_async!( - "wal-ingest-logical-message-sleep" - ); + failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep"); } else if let Some(path) = prefix.strip_prefix("neon-file:") { modification.put_file(path, message, ctx).await?; }