From 2b041964b3648ea9b1453474180535ceb5ebae19 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 17 Apr 2025 17:53:10 +0200
Subject: [PATCH] cover direct IO + concurrent IO in unit, regression & perf
 tests (#11585)

This mirrors the production config.

Thread that discusses the merits of this:
- https://neondb.slack.com/archives/C033RQ5SPDH/p1744742010740569

# Refs
- context
https://neondb.slack.com/archives/C04BLQ4LW7K/p1744724844844589?thread_ts=1744705831.014169&cid=C04BLQ4LW7K
- prep for https://github.com/neondatabase/neon/pull/11558 which adds
new io mode `direct-rw`

# Impact on CI turnaround time

Spot-checking impact on CI timings

- Baseline: [some recent main
commit](https://github.com/neondatabase/neon/actions/runs/14471549758/job/40587837475)
- Comparison: [this
commit](https://github.com/neondatabase/neon/actions/runs/14471945087/job/40589613274)
in this PR here

Impact on CI turnaround time

- Regression tests:
  - x64: very minor, sometimes better; likely in the noise
  - arm64: substantial  30min => 40min
- Benchmarks (x86 only I think): very minor; noise seems higher than
regress tests

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Alex Chi Z. <4198311+skyzh@users.noreply.github.com>
Co-authored-by: Peter Bendel <peterbendel@neon.tech>
Co-authored-by: Alex Chi Z <chi@neon.tech>
---
 .github/workflows/_build-and-test-locally.yml | 12 +++++---
 .github/workflows/build_and_test.yml          |  2 ++
 Cargo.lock                                    |  1 +
 libs/pageserver_api/Cargo.toml                |  1 +
 libs/pageserver_api/src/models.rs             | 30 +++++++++++++++++--
 pageserver/src/virtual_file.rs                |  3 +-
 test_runner/regress/test_compaction.py        |  4 ++-
 7 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 8b1314f95b..318e69d8a7 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -272,10 +272,13 @@ jobs:
           # run pageserver tests with different settings
           for get_vectored_concurrent_io in sequential sidecar-task; do
             for io_engine in std-fs tokio-epoll-uring ; do
-              NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
-                NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
-                ${cov_prefix} \
-                cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+                for io_mode in buffered direct direct-rw ; do
+                  NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
+                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
+                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOMODE=$io_mode \
+                  ${cov_prefix} \
+                  cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+              done
             done
           done
 
@@ -392,6 +395,7 @@ jobs:
           BUILD_TAG: ${{ inputs.build-tag }}
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
           PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
+          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
           USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
 
       # Temporary disable this step until we figure out why it's so flaky
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 80c4511b36..e875cb327f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -323,6 +323,8 @@ jobs:
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
+          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
           SYNC_BETWEEN_TESTS: true
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
diff --git a/Cargo.lock b/Cargo.lock
index 7ab9378853..870401e7f9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4352,6 +4352,7 @@ dependencies = [
  "humantime-serde",
  "itertools 0.10.5",
  "nix 0.27.1",
+ "once_cell",
  "postgres_backend",
  "postgres_ffi",
  "rand 0.8.5",
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 688e9de6e7..25f29b8ecd 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -35,6 +35,7 @@ nix = {workspace = true, optional = true}
 reqwest.workspace = true
 rand.workspace = true
 tracing-utils.workspace = true
+once_cell.workspace = true
 
 [dev-dependencies]
 bincode.workspace = true
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ea5456e04b..e367db614f 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1817,8 +1817,34 @@ pub mod virtual_file {
     }
 
     impl IoMode {
-        pub const fn preferred() -> Self {
-            Self::Buffered
+        pub fn preferred() -> Self {
+            // The default behavior when running Rust unit tests without any further
+            // flags is to use the newest behavior if available on the platform (Direct).
+            // The CI uses the following environment variable to unit tests for all
+            // different modes.
+            // NB: the Python regression & perf tests have their own defaults management
+            // that writes pageserver.toml; they do not use this variable.
+            if cfg!(test) {
+                use once_cell::sync::Lazy;
+                static CACHED: Lazy<IoMode> = Lazy::new(|| {
+                    utils::env::var_serde_json_string(
+                        "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
+                    )
+                    .unwrap_or({
+                        #[cfg(target_os = "linux")]
+                        {
+                            IoMode::Direct
+                        }
+                        #[cfg(not(target_os = "linux"))]
+                        {
+                            IoMode::Buffered
+                        }
+                    })
+                });
+                *CACHED
+            } else {
+                IoMode::Buffered
+            }
         }
     }
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index cd3d897423..45cd0f469b 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1366,7 +1366,8 @@ pub(crate) type IoBuffer = AlignedBuffer<ConstAlign<{ get_io_buffer_alignment()
 pub(crate) type IoPageSlice<'a> =
     AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>;
 
-static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
+static IO_MODE: once_cell::sync::Lazy<AtomicU8> =
+    once_cell::sync::Lazy::new(|| AtomicU8::new(IoMode::preferred() as u8));
 
 pub(crate) fn set_io_mode(mode: IoMode) {
     IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 001ddcdcb0..53edf9f79e 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -72,6 +72,7 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = {
     "wal_receiver_protocol",
     [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
 )
+@pytest.mark.timeout(900)
 def test_pageserver_compaction_smoke(
     neon_env_builder: NeonEnvBuilder,
     wal_receiver_protocol: PageserverWalReceiverProtocol,
@@ -190,6 +191,7 @@ def test_pageserver_compaction_preempt(
 
 
 @skip_in_debug_build("only run with release build")
+@pytest.mark.timeout(600)
 def test_pageserver_gc_compaction_preempt(
     neon_env_builder: NeonEnvBuilder,
 ):
@@ -227,7 +229,7 @@ def test_pageserver_gc_compaction_preempt(
 
 
 @skip_in_debug_build("only run with release build")
-@pytest.mark.timeout(900)  # This test is slow with sanitizers enabled, especially on ARM
+@pytest.mark.timeout(600)  # This test is slow with sanitizers enabled, especially on ARM
 @pytest.mark.parametrize(
     "with_branches",
     ["with_branches", "no_branches"],