From 32a12783fde3aeb246457ae79b18dc00f85f8896 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 May 2025 18:30:21 +0200 Subject: [PATCH] pageserver: batching & concurrent IO: update binary-built-in defaults; reduce CI matrix (#11923) Use the current production config for batching & concurrent IO. Remove the permutation testing for unit tests from CI. (The pageserver unit test matrix takes ~10min for debug builds). Drive-by-fix use of `if cfg!(test)` inside crate `pageserver_api`. It is ineffective for early-enabling new defaults for pageserver unit tests only. The reason is that the `test` cfg is only set for the crate under test but not its dependencies. So, `cargo test -p pageserver` will build `pageserver_api` with `cfg!(test) == false`. Resort to checking for feature flag `testing` instead, since all our unit tests are run with `--feature testing`. refs - `scattered-lsn` batching has been implemented and rolled out in all envs, cf https://github.com/neondatabase/neon/issues/10765 - preliminary for https://github.com/neondatabase/neon/pull/10466 - epic https://github.com/neondatabase/neon/issues/9377 - epic https://github.com/neondatabase/neon/issues/9378 - drive-by fix https://neondb.slack.com/archives/C0277TKAJCA/p1746821515504219 --- .github/workflows/_build-and-test-locally.yml | 22 +++++++------------ .github/workflows/build_and_test.yml | 2 -- libs/pageserver_api/src/config.rs | 20 +++++------------ libs/pageserver_api/src/models.rs | 11 +--------- libs/utils/src/tracing_span_assert.rs | 4 ++-- 5 files changed, 17 insertions(+), 42 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 7cede309f3..663afa2c8b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -279,18 +279,14 @@ jobs: # run all non-pageserver tests ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' - # run pageserver tests with different settings - for get_vectored_concurrent_io in sequential sidecar-task; do - for io_engine in std-fs tokio-epoll-uring ; do - for io_mode in buffered direct direct-rw ; do - NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \ - ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' - done - done - done + # run pageserver tests + # (When developing new pageserver features gated by config fields, we commonly make the rust + # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME. + # Then run the nextest invocation below for all relevant combinations. Singling out the + # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.) + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -405,8 +401,6 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e0995218f9..6b19f6ef01 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -323,8 +323,6 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 5b0c13dd89..7e0bb7dc57 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -639,23 +639,15 @@ impl Default for ConfigToml { tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, - page_service_pipelining: if !cfg!(test) { - PageServicePipeliningConfig::Serial - } else { - // Do not turn this into the default until scattered reads have been - // validated and rolled-out fully. - PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + page_service_pipelining: PageServicePipeliningConfig::Pipelined( + PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn, - }) - }, - get_vectored_concurrent_io: if !cfg!(test) { - GetVectoredConcurrentIo::Sequential - } else { - GetVectoredConcurrentIo::SidecarTask - }, - enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + }, + ), + get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask, + enable_read_path_debugging: if cfg!(feature = "testing") { Some(true) } else { None diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5fcdefba66..89d531d671 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1803,7 +1803,6 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { - use std::sync::LazyLock; #[derive( Copy, @@ -1851,15 +1850,7 @@ pub mod virtual_file { impl IoMode { pub fn preferred() -> Self { - // The default behavior when running Rust unit tests without any further - // flags is to use the newest behavior (DirectRw). - // The CI uses the environment variable to unit tests for all different modes. - // NB: the Python regression & perf tests have their own defaults management - // that writes pageserver.toml; they do not use this variable. - static ENV_OVERRIDE: LazyLock> = LazyLock::new(|| { - utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE") - }); - ENV_OVERRIDE.unwrap_or(IoMode::DirectRw) + IoMode::DirectRw } } diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index 3d15e08400..857d98b644 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -127,12 +127,12 @@ macro_rules! __check_fields_present { match check_fields_present0($extractors) { Ok(FoundEverything) => Ok(()), - Ok(Unconfigured) if cfg!(test) => { + Ok(Unconfigured) if cfg!(feature = "testing") => { // allow unconfigured in tests Ok(()) }, Ok(Unconfigured) => { - panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer") + panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#) }, Err(missing) => Err(missing) }