From 2b041964b3648ea9b1453474180535ceb5ebae19 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 17 Apr 2025 17:53:10 +0200 Subject: [PATCH] cover direct IO + concurrent IO in unit, regression & perf tests (#11585) This mirrors the production config. Thread that discusses the merits of this: - https://neondb.slack.com/archives/C033RQ5SPDH/p1744742010740569 # Refs - context https://neondb.slack.com/archives/C04BLQ4LW7K/p1744724844844589?thread_ts=1744705831.014169&cid=C04BLQ4LW7K - prep for https://github.com/neondatabase/neon/pull/11558 which adds new io mode `direct-rw` # Impact on CI turnaround time Spot-checking impact on CI timings - Baseline: [some recent main commit](https://github.com/neondatabase/neon/actions/runs/14471549758/job/40587837475) - Comparison: [this commit](https://github.com/neondatabase/neon/actions/runs/14471945087/job/40589613274) in this PR here Impact on CI turnaround time - Regression tests: - x64: very minor, sometimes better; likely in the noise - arm64: substantial 30min => 40min - Benchmarks (x86 only I think): very minor; noise seems higher than regress tests --------- Signed-off-by: Alex Chi Z Co-authored-by: Alex Chi Z. <4198311+skyzh@users.noreply.github.com> Co-authored-by: Peter Bendel Co-authored-by: Alex Chi Z --- .github/workflows/_build-and-test-locally.yml | 12 +++++--- .github/workflows/build_and_test.yml | 2 ++ Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 30 +++++++++++++++++-- pageserver/src/virtual_file.rs | 3 +- test_runner/regress/test_compaction.py | 4 ++- 7 files changed, 45 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 8b1314f95b..318e69d8a7 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -272,10 +272,13 @@ jobs: # run pageserver tests with different settings for get_vectored_concurrent_io in sequential sidecar-task; do for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ - ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + for io_mode in buffered direct direct-rw ; do + NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOMODE=$io_mode \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + done done done @@ -392,6 +395,7 @@ jobs: BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task + PAGESERVER_VIRTUAL_FILE_IO_MODE: direct USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 80c4511b36..e875cb327f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -323,6 +323,8 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task + PAGESERVER_VIRTUAL_FILE_IO_MODE: direct SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/Cargo.lock b/Cargo.lock index 7ab9378853..870401e7f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4352,6 +4352,7 @@ dependencies = [ "humantime-serde", "itertools 0.10.5", "nix 0.27.1", + "once_cell", "postgres_backend", "postgres_ffi", "rand 0.8.5", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 688e9de6e7..25f29b8ecd 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -35,6 +35,7 @@ nix = {workspace = true, optional = true} reqwest.workspace = true rand.workspace = true tracing-utils.workspace = true +once_cell.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ea5456e04b..e367db614f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1817,8 +1817,34 @@ pub mod virtual_file { } impl IoMode { - pub const fn preferred() -> Self { - Self::Buffered + pub fn preferred() -> Self { + // The default behavior when running Rust unit tests without any further + // flags is to use the newest behavior if available on the platform (Direct). + // The CI uses the following environment variable to unit tests for all + // different modes. + // NB: the Python regression & perf tests have their own defaults management + // that writes pageserver.toml; they do not use this variable. + if cfg!(test) { + use once_cell::sync::Lazy; + static CACHED: Lazy = Lazy::new(|| { + utils::env::var_serde_json_string( + "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE", + ) + .unwrap_or({ + #[cfg(target_os = "linux")] + { + IoMode::Direct + } + #[cfg(not(target_os = "linux"))] + { + IoMode::Buffered + } + }) + }); + *CACHED + } else { + IoMode::Buffered + } } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index cd3d897423..45cd0f469b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1366,7 +1366,8 @@ pub(crate) type IoBuffer = AlignedBuffer = AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>; -static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); +static IO_MODE: once_cell::sync::Lazy = + once_cell::sync::Lazy::new(|| AtomicU8::new(IoMode::preferred() as u8)); pub(crate) fn set_io_mode(mode: IoMode) { IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 001ddcdcb0..53edf9f79e 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -72,6 +72,7 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = { "wal_receiver_protocol", [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], ) +@pytest.mark.timeout(900) def test_pageserver_compaction_smoke( neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol, @@ -190,6 +191,7 @@ def test_pageserver_compaction_preempt( @skip_in_debug_build("only run with release build") +@pytest.mark.timeout(600) def test_pageserver_gc_compaction_preempt( neon_env_builder: NeonEnvBuilder, ): @@ -227,7 +229,7 @@ def test_pageserver_gc_compaction_preempt( @skip_in_debug_build("only run with release build") -@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM +@pytest.mark.timeout(600) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"],