mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 14:40:37 +00:00
Always compile failpoints support, add runtime config option to enable it.
It's annoying that many of the tests required a special build with the "testing" feature. I think it's better to have a runtime check. It adds a few CPU instructions to where failpoints are defined, even if they are disabled, but that's a small price to pay for the convenience. Fixes issue 2531, although differently from what was discussed on that issue.
This commit is contained in:
@@ -12,7 +12,6 @@
|
||||
//!
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
@@ -22,6 +21,7 @@ use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
|
||||
use crate::fail_point;
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
|
||||
@@ -35,10 +35,6 @@ project_git_version!(GIT_VERSION);
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
#[cfg(feature = "fail/failpoints")]
|
||||
"fail/failpoints",
|
||||
#[cfg(feature = "profiling")]
|
||||
"profiling",
|
||||
];
|
||||
@@ -178,6 +174,10 @@ fn initialize_config(
|
||||
let conf = PageServerConf::parse_and_validate(&toml, workdir)
|
||||
.context("Failed to parse pageserver configuration")?;
|
||||
|
||||
if pageserver::TESTING_MODE.set(conf.testing_mode).is_err() {
|
||||
anyhow::bail!("testing_mode was already initialized");
|
||||
}
|
||||
|
||||
if update_config {
|
||||
info!("Writing pageserver config to '{}'", cfg_file_path.display());
|
||||
|
||||
@@ -206,16 +206,22 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
let failpoints = fail::list();
|
||||
if !failpoints.is_empty() {
|
||||
info!(
|
||||
"started with failpoints: {}",
|
||||
failpoints
|
||||
.iter()
|
||||
.map(|(name, actions)| format!("{name}={actions}"))
|
||||
.collect::<Vec<String>>()
|
||||
.join(";")
|
||||
)
|
||||
if *pageserver::TESTING_MODE.get().unwrap() {
|
||||
let failpoints = fail::list();
|
||||
if !failpoints.is_empty() {
|
||||
info!(
|
||||
"started with testing mode enabled, failpoints: {}",
|
||||
failpoints
|
||||
.iter()
|
||||
.map(|(name, actions)| format!("{name}={actions}"))
|
||||
.collect::<Vec<String>>()
|
||||
.join(";")
|
||||
)
|
||||
} else {
|
||||
info!("started with testing mode enabled");
|
||||
}
|
||||
} else {
|
||||
info!("started with testing mode disabled");
|
||||
}
|
||||
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
|
||||
@@ -53,6 +53,8 @@ pub mod defaults {
|
||||
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
|
||||
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
|
||||
|
||||
pub const DEFAULT_TESTING_MODE: bool = false;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -75,6 +77,8 @@ pub mod defaults {
|
||||
|
||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||
|
||||
testing_mode = false
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -143,6 +147,9 @@ pub struct PageServerConf {
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
/// Enables failpoint support and extra mgmt APIs useful for testing.
|
||||
pub testing_mode: bool,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -222,6 +229,8 @@ struct PageServerConfigBuilder {
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
|
||||
|
||||
testing_mode: BuilderValue<bool>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -252,6 +261,8 @@ impl Default for PageServerConfigBuilder {
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
|
||||
|
||||
testing_mode: Set(DEFAULT_TESTING_MODE),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -332,6 +343,10 @@ impl PageServerConfigBuilder {
|
||||
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
|
||||
}
|
||||
|
||||
pub fn testing_mode(&mut self, testing_mode: bool) {
|
||||
self.testing_mode = BuilderValue::Set(testing_mode);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
@@ -380,6 +395,7 @@ impl PageServerConfigBuilder {
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?,
|
||||
testing_mode: self.testing_mode.ok_or(anyhow!("missing testing_mode"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -560,6 +576,7 @@ impl PageServerConf {
|
||||
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
|
||||
ConfigurableSemaphore::new(permits)
|
||||
}),
|
||||
"testing_mode" => builder.testing_mode(parse_toml_bool(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -681,6 +698,7 @@ impl PageServerConf {
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
testing_mode: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -694,6 +712,12 @@ fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
|
||||
Ok(s.to_string())
|
||||
}
|
||||
|
||||
fn parse_toml_bool(name: &str, item: &Item) -> Result<bool> {
|
||||
Ok(item
|
||||
.as_bool()
|
||||
.with_context(|| format!("configure option {name} is not a boolean"))?)
|
||||
}
|
||||
|
||||
fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
|
||||
// A toml integer is signed, so it cannot represent the full range of an u64. That's OK
|
||||
// for our use, though.
|
||||
@@ -870,6 +894,7 @@ log_format = 'json'
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
testing_mode: defaults::DEFAULT_TESTING_MODE,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -916,6 +941,7 @@ log_format = 'json'
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
testing_mode: defaults::DEFAULT_TESTING_MODE,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -7,12 +7,14 @@ use remote_storage::GenericRemoteStorage;
|
||||
use tracing::*;
|
||||
|
||||
use super::models::{
|
||||
LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
|
||||
TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
ConfigureFailpointsRequest, LocalTimelineInfo, RemoteTimelineInfo, StatusResponse,
|
||||
TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{config::PageServerConf, tenant_mgr};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
@@ -27,12 +29,6 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
// Imports only used for testing APIs
|
||||
#[cfg(feature = "testing")]
|
||||
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
|
||||
#[cfg(feature = "testing")]
|
||||
use crate::CheckpointConfig;
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
@@ -695,7 +691,6 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
@@ -729,7 +724,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -748,7 +742,6 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
}
|
||||
|
||||
// Run compaction immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -769,7 +762,6 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
}
|
||||
|
||||
// Run checkpoint immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -814,22 +806,26 @@ pub fn make_router(
|
||||
}))
|
||||
}
|
||||
|
||||
// A wrapper around a handler function that returns an error if the server
|
||||
// was not configured with testing_mode enabled. This is used to gate API
|
||||
// functions that should only be used in tests, never in production.
|
||||
macro_rules! testing_api {
|
||||
($handler_desc:literal, $handler:path $(,)?) => {{
|
||||
#[cfg(not(feature = "testing"))]
|
||||
async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
Err(ApiError::BadRequest(anyhow!(concat!(
|
||||
"Cannot ",
|
||||
$handler_desc,
|
||||
" because pageserver was compiled without testing APIs",
|
||||
))))
|
||||
use futures::FutureExt;
|
||||
|req: Request<Body>| {
|
||||
if conf.testing_mode {
|
||||
$handler(req).left_future()
|
||||
} else {
|
||||
async {
|
||||
Err(ApiError::BadRequest(anyhow!(concat!(
|
||||
"Cannot ",
|
||||
$handler_desc,
|
||||
" because pageserver was configured without testing APIs",
|
||||
))))
|
||||
}
|
||||
.right_future()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
let handler = $handler;
|
||||
#[cfg(not(feature = "testing"))]
|
||||
let handler = cfg_disabled;
|
||||
handler
|
||||
}};
|
||||
}
|
||||
|
||||
|
||||
@@ -148,6 +148,28 @@ pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Wrapper around fail::fail_point! macro that returns quickly if testing_mode was
|
||||
/// disabled in the pageserver config. Also enabled in unit tests.
|
||||
///
|
||||
/// fail::fail_point! is fairly quick, but it does acquire an RwLock and perform a HashMap
|
||||
/// lookup. This macro is hopefully cheap enough that we don't need to worry about the
|
||||
/// overhead even in production, and even if the macro is used in hot spots. (This check
|
||||
/// compiles to two cmp instructions; get_unchecked() would shrink it to one.)
|
||||
///
|
||||
#[macro_export]
|
||||
macro_rules! fail_point {
|
||||
($($name:expr),*) => {{
|
||||
if cfg!(test) || *crate::TESTING_MODE.get().expect("testing_mode not initialized") {
|
||||
fail::fail_point!($($name), *)
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
/// This is set early in the pageserver startup, from the "testing_mode" setting in the
|
||||
/// config file.
|
||||
pub static TESTING_MODE: once_cell::sync::OnceCell<bool> = once_cell::sync::OnceCell::new();
|
||||
|
||||
#[cfg(test)]
|
||||
mod backoff_defaults_tests {
|
||||
use super::*;
|
||||
|
||||
@@ -12,7 +12,7 @@ pub(super) async fn delete_layer<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
local_layer_path: &'a Path,
|
||||
) -> anyhow::Result<()> {
|
||||
fail::fail_point!("before-delete-layer", |_| {
|
||||
crate::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
|
||||
|
||||
@@ -97,7 +97,7 @@ pub async fn download_layer_file<'a>(
|
||||
})?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
crate::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
bail!("remote-storage-download-pre-rename failpoint triggered")
|
||||
});
|
||||
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use fail::fail_point;
|
||||
use std::path::Path;
|
||||
use tokio::fs;
|
||||
|
||||
use super::index::IndexPart;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::fail_point;
|
||||
use crate::storage_sync::LayerFileMetadata;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -46,6 +46,7 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use self::metadata::TimelineMetadata;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::fail_point;
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
|
||||
@@ -255,7 +256,7 @@ impl UninitializedTimeline<'_> {
|
||||
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
|
||||
raw_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
@@ -2098,7 +2099,7 @@ impl Tenant {
|
||||
// Thus spawn flush loop manually and skip flush_loop setup in initialize_with_lock
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
anyhow::bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
@@ -2192,7 +2193,7 @@ impl Tenant {
|
||||
.context("Failed to create timeline data structure")?;
|
||||
crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
|
||||
|
||||
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
|
||||
fail_point!("after-timeline-uninit-mark-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
||||
});
|
||||
|
||||
@@ -2381,7 +2382,7 @@ fn try_create_target_tenant_dir(
|
||||
temporary_tenant_timelines_dir.display()
|
||||
)
|
||||
})?;
|
||||
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
|
||||
fail_point!("tenant-creation-before-tmp-rename", |_| {
|
||||
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
|
||||
});
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::TimelineState;
|
||||
@@ -19,6 +18,7 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use crate::fail_point;
|
||||
use crate::storage_sync::index::IndexPart;
|
||||
use crate::storage_sync::RemoteTimelineClient;
|
||||
use crate::tenant::{
|
||||
|
||||
@@ -12,16 +12,19 @@ use once_cell::sync::Lazy;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::*;
|
||||
|
||||
use pageserver_api::models::TimelineGcRequest;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::IGNORED_TENANT_FILE_NAME;
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::http::error::ApiError;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
|
||||
@@ -460,13 +463,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub async fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -494,7 +490,7 @@ pub async fn immediate_gc(
|
||||
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
crate::fail_point!("immediate_gc_task_pre");
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
|
||||
@@ -9,7 +9,6 @@ use std::{
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::BytesMut;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use fail::fail_point;
|
||||
use futures::StreamExt;
|
||||
use postgres::{SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::v14::xlog_utils::normalize_lsn;
|
||||
@@ -20,6 +19,7 @@ use tokio::{pin, select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use crate::fail_point;
|
||||
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
|
||||
use crate::{
|
||||
task_mgr,
|
||||
|
||||
Reference in New Issue
Block a user