Always compile failpoints support, add runtime config option to enable it.

It's annoying that many of the tests required a special build with the
"testing" feature. I think it's better to have a runtime check. It adds
a few CPU instructions to where failpoints are defined, even if they are
disabled, but that's a small price to pay for the convenience.

Fixes issue 2531, although differently from what was discussed on that
issue.
This commit is contained in:
Heikki Linnakangas
2022-12-07 23:36:51 +02:00
parent f5a735ac3b
commit e0c43396bf
23 changed files with 120 additions and 98 deletions

View File

@@ -12,7 +12,6 @@
//!
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{BufMut, BytesMut};
use fail::fail_point;
use itertools::Itertools;
use std::fmt::Write as FmtWrite;
use std::io;
@@ -22,6 +21,7 @@ use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tracing::*;
use crate::fail_point;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};

View File

@@ -35,10 +35,6 @@ project_git_version!(GIT_VERSION);
const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
#[cfg(feature = "fail/failpoints")]
"fail/failpoints",
#[cfg(feature = "profiling")]
"profiling",
];
@@ -178,6 +174,10 @@ fn initialize_config(
let conf = PageServerConf::parse_and_validate(&toml, workdir)
.context("Failed to parse pageserver configuration")?;
if pageserver::TESTING_MODE.set(conf.testing_mode).is_err() {
anyhow::bail!("testing_mode was already initialized");
}
if update_config {
info!("Writing pageserver config to '{}'", cfg_file_path.display());
@@ -206,16 +206,22 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
let failpoints = fail::list();
if !failpoints.is_empty() {
info!(
"started with failpoints: {}",
failpoints
.iter()
.map(|(name, actions)| format!("{name}={actions}"))
.collect::<Vec<String>>()
.join(";")
)
if *pageserver::TESTING_MODE.get().unwrap() {
let failpoints = fail::list();
if !failpoints.is_empty() {
info!(
"started with testing mode enabled, failpoints: {}",
failpoints
.iter()
.map(|(name, actions)| format!("{name}={actions}"))
.collect::<Vec<String>>()
.join(";")
)
} else {
info!("started with testing mode enabled");
}
} else {
info!("started with testing mode disabled");
}
let lock_file_path = conf.workdir.join(PID_FILE_NAME);

View File

@@ -53,6 +53,8 @@ pub mod defaults {
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_TESTING_MODE: bool = false;
///
/// Default built-in configuration file.
///
@@ -75,6 +77,8 @@ pub mod defaults {
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
testing_mode = false
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -143,6 +147,9 @@ pub struct PageServerConf {
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Enables failpoint support and extra mgmt APIs useful for testing.
pub testing_mode: bool,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -222,6 +229,8 @@ struct PageServerConfigBuilder {
log_format: BuilderValue<LogFormat>,
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
testing_mode: BuilderValue<bool>,
}
impl Default for PageServerConfigBuilder {
@@ -252,6 +261,8 @@ impl Default for PageServerConfigBuilder {
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
testing_mode: Set(DEFAULT_TESTING_MODE),
}
}
}
@@ -332,6 +343,10 @@ impl PageServerConfigBuilder {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
pub fn testing_mode(&mut self, testing_mode: bool) {
self.testing_mode = BuilderValue::Set(testing_mode);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
Ok(PageServerConf {
listen_pg_addr: self
@@ -380,6 +395,7 @@ impl PageServerConfigBuilder {
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?,
testing_mode: self.testing_mode.ok_or(anyhow!("missing testing_mode"))?,
})
}
}
@@ -560,6 +576,7 @@ impl PageServerConf {
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
ConfigurableSemaphore::new(permits)
}),
"testing_mode" => builder.testing_mode(parse_toml_bool(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -681,6 +698,7 @@ impl PageServerConf {
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
testing_mode: true,
}
}
}
@@ -694,6 +712,12 @@ fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
Ok(s.to_string())
}
fn parse_toml_bool(name: &str, item: &Item) -> Result<bool> {
Ok(item
.as_bool()
.with_context(|| format!("configure option {name} is not a boolean"))?)
}
fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
// A toml integer is signed, so it cannot represent the full range of an u64. That's OK
// for our use, though.
@@ -870,6 +894,7 @@ log_format = 'json'
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
testing_mode: defaults::DEFAULT_TESTING_MODE,
},
"Correct defaults should be used when no config values are provided"
);
@@ -916,6 +941,7 @@ log_format = 'json'
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::Json,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
testing_mode: defaults::DEFAULT_TESTING_MODE,
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -7,12 +7,14 @@ use remote_storage::GenericRemoteStorage;
use tracing::*;
use super::models::{
LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
ConfigureFailpointsRequest, LocalTimelineInfo, RemoteTimelineInfo, StatusResponse,
TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::tenant::Timeline;
use crate::tenant_config::TenantConfOpt;
use crate::CheckpointConfig;
use crate::{config::PageServerConf, tenant_mgr};
use utils::{
auth::JwtAuth,
@@ -27,12 +29,6 @@ use utils::{
lsn::Lsn,
};
// Imports only used for testing APIs
#[cfg(feature = "testing")]
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
#[cfg(feature = "testing")]
use crate::CheckpointConfig;
struct State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
@@ -695,7 +691,6 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
json_response(StatusCode::OK, ())
}
#[cfg(feature = "testing")]
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow!(
@@ -729,7 +724,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
}
// Run GC immediately on given timeline.
#[cfg(feature = "testing")]
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -748,7 +742,6 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
}
// Run compaction immediately on given timeline.
#[cfg(feature = "testing")]
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -769,7 +762,6 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
}
// Run checkpoint immediately on given timeline.
#[cfg(feature = "testing")]
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -814,22 +806,26 @@ pub fn make_router(
}))
}
// A wrapper around a handler function that returns an error if the server
// was not configured with testing_mode enabled. This is used to gate API
// functions that should only be used in tests, never in production.
macro_rules! testing_api {
($handler_desc:literal, $handler:path $(,)?) => {{
#[cfg(not(feature = "testing"))]
async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
Err(ApiError::BadRequest(anyhow!(concat!(
"Cannot ",
$handler_desc,
" because pageserver was compiled without testing APIs",
))))
use futures::FutureExt;
|req: Request<Body>| {
if conf.testing_mode {
$handler(req).left_future()
} else {
async {
Err(ApiError::BadRequest(anyhow!(concat!(
"Cannot ",
$handler_desc,
" because pageserver was configured without testing APIs",
))))
}
.right_future()
}
}
#[cfg(feature = "testing")]
let handler = $handler;
#[cfg(not(feature = "testing"))]
let handler = cfg_disabled;
handler
}};
}

View File

@@ -148,6 +148,28 @@ pub fn is_uninit_mark(path: &Path) -> bool {
}
}
///
/// Wrapper around fail::fail_point! macro that returns quickly if testing_mode was
/// disabled in the pageserver config. Also enabled in unit tests.
///
/// fail::fail_point! is fairly quick, but it does acquire an RwLock and perform a HashMap
/// lookup. This macro is hopefully cheap enough that we don't need to worry about the
/// overhead even in production, and even if the macro is used in hot spots. (This check
/// compiles to two cmp instructions; get_unchecked() would shrink it to one.)
///
#[macro_export]
macro_rules! fail_point {
($($name:expr),*) => {{
if cfg!(test) || *crate::TESTING_MODE.get().expect("testing_mode not initialized") {
fail::fail_point!($($name), *)
}
}};
}
/// This is set early in the pageserver startup, from the "testing_mode" setting in the
/// config file.
pub static TESTING_MODE: once_cell::sync::OnceCell<bool> = once_cell::sync::OnceCell::new();
#[cfg(test)]
mod backoff_defaults_tests {
use super::*;

View File

@@ -12,7 +12,7 @@ pub(super) async fn delete_layer<'a>(
storage: &'a GenericRemoteStorage,
local_layer_path: &'a Path,
) -> anyhow::Result<()> {
fail::fail_point!("before-delete-layer", |_| {
crate::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!("Deleting layer from remote storage: {local_layer_path:?}",);

View File

@@ -97,7 +97,7 @@ pub async fn download_layer_file<'a>(
})?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
crate::fail_point!("remote-storage-download-pre-rename", |_| {
bail!("remote-storage-download-pre-rename failpoint triggered")
});

View File

@@ -1,12 +1,12 @@
//! Helper functions to upload files to remote storage with a RemoteStorage
use anyhow::{bail, Context};
use fail::fail_point;
use std::path::Path;
use tokio::fs;
use super::index::IndexPart;
use crate::config::PageServerConf;
use crate::fail_point;
use crate::storage_sync::LayerFileMetadata;
use remote_storage::GenericRemoteStorage;
use utils::id::{TenantId, TimelineId};

View File

@@ -46,6 +46,7 @@ use std::time::{Duration, Instant};
use self::metadata::TimelineMetadata;
use crate::config::PageServerConf;
use crate::fail_point;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
@@ -255,7 +256,7 @@ impl UninitializedTimeline<'_> {
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
raw_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
fail_point!("before-checkpoint-new-timeline", |_| {
bail!("failpoint before-checkpoint-new-timeline");
});
@@ -2098,7 +2099,7 @@ impl Tenant {
// Thus spawn flush loop manually and skip flush_loop setup in initialize_with_lock
unfinished_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
fail_point!("before-checkpoint-new-timeline", |_| {
anyhow::bail!("failpoint before-checkpoint-new-timeline");
});
@@ -2192,7 +2193,7 @@ impl Tenant {
.context("Failed to create timeline data structure")?;
crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
fail_point!("after-timeline-uninit-mark-creation", |_| {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});
@@ -2381,7 +2382,7 @@ fn try_create_target_tenant_dir(
temporary_tenant_timelines_dir.display()
)
})?;
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
fail_point!("tenant-creation-before-tmp-rename", |_| {
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
});

View File

@@ -2,7 +2,6 @@
use anyhow::{anyhow, bail, ensure, Context};
use bytes::Bytes;
use fail::fail_point;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pageserver_api::models::TimelineState;
@@ -19,6 +18,7 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::{Duration, Instant, SystemTime};
use crate::fail_point;
use crate::storage_sync::index::IndexPart;
use crate::storage_sync::RemoteTimelineClient;
use crate::tenant::{

View File

@@ -12,16 +12,19 @@ use once_cell::sync::Lazy;
use tokio::sync::RwLock;
use tracing::*;
use pageserver_api::models::TimelineGcRequest;
use remote_storage::GenericRemoteStorage;
use utils::crashsafe;
use crate::config::PageServerConf;
use crate::repository::GcResult;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::{Tenant, TenantState};
use crate::tenant_config::TenantConfOpt;
use crate::IGNORED_TENANT_FILE_NAME;
use utils::fs_ext::PathExt;
use utils::http::error::ApiError;
use utils::id::{TenantId, TimelineId};
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
@@ -460,13 +463,6 @@ where
}
}
#[cfg(feature = "testing")]
use {
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
utils::http::error::ApiError,
};
#[cfg(feature = "testing")]
pub async fn immediate_gc(
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -494,7 +490,7 @@ pub async fn immediate_gc(
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");
crate::fail_point!("immediate_gc_task_pre");
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))

View File

@@ -9,7 +9,6 @@ use std::{
use anyhow::{bail, ensure, Context};
use bytes::BytesMut;
use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres::{SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::v14::xlog_utils::normalize_lsn;
@@ -20,6 +19,7 @@ use tokio::{pin, select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tracing::{debug, error, info, trace, warn};
use crate::fail_point;
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
use crate::{
task_mgr,