diff --git a/.cargo/config.toml b/.cargo/config.toml index c40783bc1b..76a2ff549e 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -11,6 +11,3 @@ opt-level = 3 [profile.dev] # Turn on a small amount of optimization in Development mode. opt-level = 1 - -[alias] -build_testing = ["build", "--features", "testing"] diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 49f94ad60e..107101ab19 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -100,11 +100,11 @@ jobs: run: | if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FEATURES="--features testing" + CARGO_FEATURES="" CARGO_FLAGS="--locked --timings $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FEATURES="--features testing,profiling" + CARGO_FEATURES="--features profiling" CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV @@ -539,9 +539,9 @@ jobs: # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like - # Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: [] + # Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c features: [] # Bad versions might loop like: - # Neon page server git-env:local failpoints: true, features: ["testing"] + # Neon page server git-env:local features: [""] # Ensure that we don't have bad versions. - name: Verify image versions shell: bash # ensure no set -e for better error messages @@ -555,11 +555,6 @@ jobs: exit 1 fi - if ! echo "$pageserver_version" | grep -qv '"testing"' ; then - echo "Pageserver version should have no testing feature enabled" - exit 1 - fi - - name: Verify docker-compose example run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh diff --git a/README.md b/README.md index c31bac6446..af4197c4e5 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne ```sh git clone --recursive https://github.com/neondatabase/neon.git -CARGO_BUILD_FLAGS="--features=testing" make +make ./scripts/pytest ``` diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 43d51f90c1..9db07b52e7 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -5,10 +5,6 @@ edition = "2021" [features] default = [] -# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, -# which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints"] - profiling = ["pprof"] [dependencies] diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 973c3cd3a6..0964fe2fc9 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -12,7 +12,6 @@ //! use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; -use fail::fail_point; use itertools::Itertools; use std::fmt::Write as FmtWrite; use std::io; @@ -22,6 +21,7 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; +use crate::fail_point; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d70b36616b..5110a1a743 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -35,10 +35,6 @@ project_git_version!(GIT_VERSION); const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ - #[cfg(feature = "testing")] - "testing", - #[cfg(feature = "fail/failpoints")] - "fail/failpoints", #[cfg(feature = "profiling")] "profiling", ]; @@ -178,6 +174,10 @@ fn initialize_config( let conf = PageServerConf::parse_and_validate(&toml, workdir) .context("Failed to parse pageserver configuration")?; + if pageserver::TESTING_MODE.set(conf.testing_mode).is_err() { + anyhow::bail!("testing_mode was already initialized"); + } + if update_config { info!("Writing pageserver config to '{}'", cfg_file_path.display()); @@ -206,16 +206,22 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes - let failpoints = fail::list(); - if !failpoints.is_empty() { - info!( - "started with failpoints: {}", - failpoints - .iter() - .map(|(name, actions)| format!("{name}={actions}")) - .collect::>() - .join(";") - ) + if *pageserver::TESTING_MODE.get().unwrap() { + let failpoints = fail::list(); + if !failpoints.is_empty() { + info!( + "started with testing mode enabled, failpoints: {}", + failpoints + .iter() + .map(|(name, actions)| format!("{name}={actions}")) + .collect::>() + .join(";") + ) + } else { + info!("started with testing mode enabled"); + } + } else { + info!("started with testing mode disabled"); } let lock_file_path = conf.workdir.join(PID_FILE_NAME); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 86f1fcef94..51f3474da1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -53,6 +53,8 @@ pub mod defaults { pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); + pub const DEFAULT_TESTING_MODE: bool = false; + /// /// Default built-in configuration file. /// @@ -75,6 +77,8 @@ pub mod defaults { #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' +testing_mode = false + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -143,6 +147,9 @@ pub struct PageServerConf { /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, + + /// Enables failpoint support and extra mgmt APIs useful for testing. + pub testing_mode: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -222,6 +229,8 @@ struct PageServerConfigBuilder { log_format: BuilderValue, concurrent_tenant_size_logical_size_queries: BuilderValue, + + testing_mode: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -252,6 +261,8 @@ impl Default for PageServerConfigBuilder { log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), + + testing_mode: Set(DEFAULT_TESTING_MODE), } } } @@ -332,6 +343,10 @@ impl PageServerConfigBuilder { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } + pub fn testing_mode(&mut self, testing_mode: bool) { + self.testing_mode = BuilderValue::Set(testing_mode); + } + pub fn build(self) -> anyhow::Result { Ok(PageServerConf { listen_pg_addr: self @@ -380,6 +395,7 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing concurrent_tenant_size_logical_size_queries" ))?, + testing_mode: self.testing_mode.ok_or(anyhow!("missing testing_mode"))?, }) } } @@ -560,6 +576,7 @@ impl PageServerConf { let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; ConfigurableSemaphore::new(permits) }), + "testing_mode" => builder.testing_mode(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -681,6 +698,7 @@ impl PageServerConf { broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + testing_mode: true, } } } @@ -694,6 +712,12 @@ fn parse_toml_string(name: &str, item: &Item) -> Result { Ok(s.to_string()) } +fn parse_toml_bool(name: &str, item: &Item) -> Result { + Ok(item + .as_bool() + .with_context(|| format!("configure option {name} is not a boolean"))?) +} + fn parse_toml_u64(name: &str, item: &Item) -> Result { // A toml integer is signed, so it cannot represent the full range of an u64. That's OK // for our use, though. @@ -870,6 +894,7 @@ log_format = 'json' broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + testing_mode: defaults::DEFAULT_TESTING_MODE, }, "Correct defaults should be used when no config values are provided" ); @@ -916,6 +941,7 @@ log_format = 'json' broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + testing_mode: defaults::DEFAULT_TESTING_MODE, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0ef555c4aa..8c853ec0c4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -7,12 +7,14 @@ use remote_storage::GenericRemoteStorage; use tracing::*; use super::models::{ - LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest, - TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, + ConfigureFailpointsRequest, LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, + TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, + TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::tenant::Timeline; use crate::tenant_config::TenantConfOpt; +use crate::CheckpointConfig; use crate::{config::PageServerConf, tenant_mgr}; use utils::{ auth::JwtAuth, @@ -27,12 +29,6 @@ use utils::{ lsn::Lsn, }; -// Imports only used for testing APIs -#[cfg(feature = "testing")] -use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; -#[cfg(feature = "testing")] -use crate::CheckpointConfig; - struct State { conf: &'static PageServerConf, auth: Option>, @@ -695,7 +691,6 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result, ApiError> { if !fail::has_failpoints() { return Err(ApiError::BadRequest(anyhow!( @@ -729,7 +724,6 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -#[cfg(feature = "testing")] async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -748,7 +742,6 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -769,7 +762,6 @@ async fn timeline_compact_handler(request: Request) -> Result) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -814,22 +806,26 @@ pub fn make_router( })) } + // A wrapper around a handler function that returns an error if the server + // was not configured with testing_mode enabled. This is used to gate API + // functions that should only be used in tests, never in production. macro_rules! testing_api { ($handler_desc:literal, $handler:path $(,)?) => {{ - #[cfg(not(feature = "testing"))] - async fn cfg_disabled(_req: Request) -> Result, ApiError> { - Err(ApiError::BadRequest(anyhow!(concat!( - "Cannot ", - $handler_desc, - " because pageserver was compiled without testing APIs", - )))) + use futures::FutureExt; + |req: Request| { + if conf.testing_mode { + $handler(req).left_future() + } else { + async { + Err(ApiError::BadRequest(anyhow!(concat!( + "Cannot ", + $handler_desc, + " because pageserver was configured without testing APIs", + )))) + } + .right_future() + } } - - #[cfg(feature = "testing")] - let handler = $handler; - #[cfg(not(feature = "testing"))] - let handler = cfg_disabled; - handler }}; } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index eafcaa88d9..ea6f709456 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -148,6 +148,28 @@ pub fn is_uninit_mark(path: &Path) -> bool { } } +/// +/// Wrapper around fail::fail_point! macro that returns quickly if testing_mode was +/// disabled in the pageserver config. Also enabled in unit tests. +/// +/// fail::fail_point! is fairly quick, but it does acquire an RwLock and perform a HashMap +/// lookup. This macro is hopefully cheap enough that we don't need to worry about the +/// overhead even in production, and even if the macro is used in hot spots. (This check +/// compiles to two cmp instructions; get_unchecked() would shrink it to one.) +/// +#[macro_export] +macro_rules! fail_point { + ($($name:expr),*) => {{ + if cfg!(test) || *crate::TESTING_MODE.get().expect("testing_mode not initialized") { + fail::fail_point!($($name), *) + } + }}; +} + +/// This is set early in the pageserver startup, from the "testing_mode" setting in the +/// config file. +pub static TESTING_MODE: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); + #[cfg(test)] mod backoff_defaults_tests { use super::*; diff --git a/pageserver/src/storage_sync2/delete.rs b/pageserver/src/storage_sync2/delete.rs index 9f6732fbff..cdba17062e 100644 --- a/pageserver/src/storage_sync2/delete.rs +++ b/pageserver/src/storage_sync2/delete.rs @@ -12,7 +12,7 @@ pub(super) async fn delete_layer<'a>( storage: &'a GenericRemoteStorage, local_layer_path: &'a Path, ) -> anyhow::Result<()> { - fail::fail_point!("before-delete-layer", |_| { + crate::fail_point!("before-delete-layer", |_| { anyhow::bail!("failpoint before-delete-layer") }); debug!("Deleting layer from remote storage: {local_layer_path:?}",); diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs index 18a6ac0179..1f4c2f3090 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/storage_sync2/download.rs @@ -97,7 +97,7 @@ pub async fn download_layer_file<'a>( })?; drop(destination_file); - fail::fail_point!("remote-storage-download-pre-rename", |_| { + crate::fail_point!("remote-storage-download-pre-rename", |_| { bail!("remote-storage-download-pre-rename failpoint triggered") }); diff --git a/pageserver/src/storage_sync2/upload.rs b/pageserver/src/storage_sync2/upload.rs index 57a524a22d..84a7415e48 100644 --- a/pageserver/src/storage_sync2/upload.rs +++ b/pageserver/src/storage_sync2/upload.rs @@ -1,12 +1,12 @@ //! Helper functions to upload files to remote storage with a RemoteStorage use anyhow::{bail, Context}; -use fail::fail_point; use std::path::Path; use tokio::fs; use super::index::IndexPart; use crate::config::PageServerConf; +use crate::fail_point; use crate::storage_sync::LayerFileMetadata; use remote_storage::GenericRemoteStorage; use utils::id::{TenantId, TimelineId}; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a7601ba2a7..d323ee2b88 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -46,6 +46,7 @@ use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; use crate::config::PageServerConf; +use crate::fail_point; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; @@ -255,7 +256,7 @@ impl UninitializedTimeline<'_> { // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock raw_timeline.maybe_spawn_flush_loop(); - fail::fail_point!("before-checkpoint-new-timeline", |_| { + fail_point!("before-checkpoint-new-timeline", |_| { bail!("failpoint before-checkpoint-new-timeline"); }); @@ -2098,7 +2099,7 @@ impl Tenant { // Thus spawn flush loop manually and skip flush_loop setup in initialize_with_lock unfinished_timeline.maybe_spawn_flush_loop(); - fail::fail_point!("before-checkpoint-new-timeline", |_| { + fail_point!("before-checkpoint-new-timeline", |_| { anyhow::bail!("failpoint before-checkpoint-new-timeline"); }); @@ -2192,7 +2193,7 @@ impl Tenant { .context("Failed to create timeline data structure")?; crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; - fail::fail_point!("after-timeline-uninit-mark-creation", |_| { + fail_point!("after-timeline-uninit-mark-creation", |_| { anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); }); @@ -2381,7 +2382,7 @@ fn try_create_target_tenant_dir( temporary_tenant_timelines_dir.display() ) })?; - fail::fail_point!("tenant-creation-before-tmp-rename", |_| { + fail_point!("tenant-creation-before-tmp-rename", |_| { anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ec8049bcea..542756b8dc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2,7 +2,6 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; -use fail::fail_point; use itertools::Itertools; use once_cell::sync::OnceCell; use pageserver_api::models::TimelineState; @@ -19,6 +18,7 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::{Duration, Instant, SystemTime}; +use crate::fail_point; use crate::storage_sync::index::IndexPart; use crate::storage_sync::RemoteTimelineClient; use crate::tenant::{ diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index f4f1eba717..1cfd33b3ab 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -12,16 +12,19 @@ use once_cell::sync::Lazy; use tokio::sync::RwLock; use tracing::*; +use pageserver_api::models::TimelineGcRequest; use remote_storage::GenericRemoteStorage; use utils::crashsafe; use crate::config::PageServerConf; +use crate::repository::GcResult; use crate::task_mgr::{self, TaskKind}; use crate::tenant::{Tenant, TenantState}; use crate::tenant_config::TenantConfOpt; use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; +use utils::http::error::ApiError; use utils::id::{TenantId, TimelineId}; static TENANTS: Lazy>>> = @@ -460,13 +463,6 @@ where } } -#[cfg(feature = "testing")] -use { - crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, - utils::http::error::ApiError, -}; - -#[cfg(feature = "testing")] pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, @@ -494,7 +490,7 @@ pub async fn immediate_gc( &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"), false, async move { - fail::fail_point!("immediate_gc_task_pre"); + crate::fail_point!("immediate_gc_task_pre"); let result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, true) .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index cf2a99f1b5..a5ea091a4d 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -9,7 +9,6 @@ use std::{ use anyhow::{bail, ensure, Context}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; -use fail::fail_point; use futures::StreamExt; use postgres::{SimpleQueryMessage, SimpleQueryRow}; use postgres_ffi::v14::xlog_utils::normalize_lsn; @@ -20,6 +19,7 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; +use crate::fail_point; use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; use crate::{ task_mgr, diff --git a/run_clippy.sh b/run_clippy.sh index bf770432d0..9feb8de4ea 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -13,7 +13,7 @@ # avoid running regular linting script that checks every feature. if [[ "$OSTYPE" == "darwin"* ]]; then # no extra features to test currently, add more here when needed - cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings + cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings else # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use diff --git a/test_runner/README.md b/test_runner/README.md index e066ac3235..a3603ac957 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,9 +6,6 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. - For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. - Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables below to run from other directories. - The neon git repo, including the postgres submodule diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6fae448794..0df0e62d63 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -587,6 +587,7 @@ class NeonEnvBuilder: auth_enabled: bool = False, rust_log_override: Optional[str] = None, default_branch_name: str = DEFAULT_BRANCH_NAME, + testing_mode: bool = True, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -608,6 +609,7 @@ class NeonEnvBuilder: self.neon_binpath = neon_binpath self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version + self.testing_mode = testing_mode def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -858,6 +860,7 @@ class NeonEnv: http=self.port_distributor.get_port(), ) pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust" + pageserver_testing_mode = "true" if config.testing_mode else "false" toml += textwrap.dedent( f""" @@ -866,6 +869,7 @@ class NeonEnv: listen_pg_addr = 'localhost:{pageserver_port.pg}' listen_http_addr = 'localhost:{pageserver_port.http}' auth_type = '{pageserver_auth_type}' + testing_mode = {pageserver_testing_mode} """ ) @@ -978,6 +982,10 @@ def _shared_simple_env( pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, run_id=run_id, + # Disable failpoint support. Failpoints could have unexpected consequences + # when the pageserver is shared by concurrent tests. Also, it might affect + # performance, and we use the shared simple env in performance tests. + testing_mode=False, ) as builder: env = builder.init_start() @@ -1048,11 +1056,10 @@ class PageserverApiException(Exception): class PageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): + def __init__(self, port: int, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token - self.is_testing_enabled_or_skip = is_testing_enabled_or_skip if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" @@ -1071,8 +1078,6 @@ class PageserverHttpClient(requests.Session): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): - self.is_testing_enabled_or_skip() - if isinstance(config_strings, tuple): pairs = [config_strings] else: @@ -1212,8 +1217,6 @@ class PageserverHttpClient(requests.Session): def timeline_gc( self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] ) -> dict[str, Any]: - self.is_testing_enabled_or_skip() - log.info( f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" ) @@ -1229,8 +1232,6 @@ class PageserverHttpClient(requests.Session): return res_json def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): - self.is_testing_enabled_or_skip() - log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" @@ -1254,8 +1255,6 @@ class PageserverHttpClient(requests.Session): return res_json def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): - self.is_testing_enabled_or_skip() - log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" @@ -1815,10 +1814,6 @@ class NeonPageserver(PgProtocol): ): self.stop(immediate=True) - def is_testing_enabled_or_skip(self): - if '"testing"' not in self.version: - pytest.skip("pageserver was built without 'testing' feature") - def is_profiling_enabled_or_skip(self): if '"profiling"' not in self.version: pytest.skip("pageserver was built without 'profiling' feature") @@ -1827,7 +1822,6 @@ class NeonPageserver(PgProtocol): return PageserverHttpClient( port=self.service_port.http, auth_token=auth_token, - is_testing_enabled_or_skip=self.is_testing_enabled_or_skip, ) def assert_no_errors(self): diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 725612853a..d9805193ac 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -3,7 +3,7 @@ First make a release build. The profiling flag is optional, used only for tests that generate flame graphs. The `-s` flag just silences a lot of output, and makes it easier to see if you have compile errors without scrolling up. -`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8` +`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=profiling" make -s -j8` NOTE: the `profiling` flag only works on linux because we use linux-specific libc APIs like `libc::timer_t`. diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 6b3324b7a7..83db665efd 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -327,7 +327,6 @@ def check_neon_works( auth_token = snapshot_config["pageserver"]["auth_token"] pageserver_http = PageserverHttpClient( port=pageserver_port, - is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled auth_token=auth_token, ) diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 1e93958e98..1888862b3f 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -13,7 +13,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" env = neon_env_builder.init() - env.pageserver.is_testing_enabled_or_skip() neon_env_builder.start() diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index c4b3b28f34..87b557e2cc 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -58,7 +58,6 @@ def new_pageserver_service( pageserver_client = PageserverHttpClient( port=http_port, auth_token=None, - is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled ) try: pageserver_process = start_in_background( @@ -360,7 +359,6 @@ def test_tenant_relocation( new_pageserver_http = PageserverHttpClient( port=new_pageserver_http_port, auth_token=None, - is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip, ) with new_pageserver_service(