From 48b22bd057070f10fda9e4c8a7f5326441617b22 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 20 Mar 2024 15:01:14 +0000
Subject: [PATCH 1/4] walredo: better benchmark

---
 pageserver/benches/bench_walredo.rs | 317 +++++++++++++---------------
 1 file changed, 144 insertions(+), 173 deletions(-)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 47c8bd75c6..72c0f2c6c0 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,160 +1,128 @@
-//! Simple benchmarking around walredo.
+//! Quantify a single walredo manager's throughput under N concurrent callers.
 //!
-//! Right now they hope to just set a baseline. Later we can try to expand into latency and
-//! throughput after figuring out the coordinated omission problems below.
+//! The benchmark implementation ([`bench_impl`]) is parametrized by
+//! - `redo_work` => [`Request::short_input`] or [`Request::medium_input`]
+//! - `n_redos` => number of times the benchmark shall execute the `redo_work`
+//! - `nclients` => number of clients (more on this shortly).
 //!
-//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
-//! logging what happens when a sequential scan is requested on a small table, then picking out two
-//! suitable from logs.
+//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
+//! It spawns `nclients` [`client`] tokio tasks.
+//! Each task executes the `redo_work` `n_redos/nclients` times.
 //!
+//! We exercise the following combinations:
+//! - `redo_work = short / medium`
+//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
-//! Reference data (git blame to see commit) on an i3en.3xlarge
-// ```text
-//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
-//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
-//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
-//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
-//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
-//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
-//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
-//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
-//! ``
-
-use std::sync::Arc;
+//! We let `criterion` determine the `n_redos` using `iter_custom`.
+//! The idea is that for each `(redo_work, nclients)` combination,
+//! criterion will run the `bench_impl` multiple times with different `n_redos`.
+//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
+//! Criterion will divide that by `n_redos` to compute the "time per iteration".
+//! In our case, "time per iteration" means "time per redo_work execution".
+//!
+//! NB: the way by which `iter_custom` determines the "number of iterations"
+//! is called sampling. Apparently the idea here is to detect outliers.
+//! We're not sure whether the current choice of sampling method makes sense.
+//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
 
 use bytes::{Buf, Bytes};
-use pageserver::{
-    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
+use criterion::{BenchmarkId, Criterion};
+use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver_api::{key::Key, shard::TenantShardId};
+use std::{
+    sync::Arc,
+    time::{Duration, Instant},
 };
-use pageserver_api::shard::TenantShardId;
-use tokio::task::JoinSet;
+use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};
 
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+fn bench(c: &mut Criterion) {
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("short");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::short_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
+        }
+    }
 
-fn redo_scenarios(c: &mut Criterion) {
-    // logging should be enabled when adding more inputs, since walredo will only report malformed
-    // input to the stderr.
-    // utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("medium");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::medium_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
+        }
+    }
+}
+
+criterion::criterion_group!(benches, bench);
+criterion::criterion_main!(benches);
 
+// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
+fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
     let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
     let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
     let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
-
-    let manager = Arc::new(manager);
-
-    {
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-        tracing::info!("executing first");
-        rt.block_on(short().execute(&manager)).unwrap();
-        tracing::info!("first executed");
-    }
-
-    let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
-
-    let mut group = c.benchmark_group("short");
-    group.sampling_mode(criterion::SamplingMode::Flat);
-
-    for thread_count in thread_counts {
-        group.bench_with_input(
-            BenchmarkId::new("short", thread_count),
-            &thread_count,
-            |b, thread_count| {
-                add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
-            },
-        );
-    }
-    drop(group);
-
-    let mut group = c.benchmark_group("medium");
-    group.sampling_mode(criterion::SamplingMode::Flat);
-
-    for thread_count in thread_counts {
-        group.bench_with_input(
-            BenchmarkId::new("medium", thread_count),
-            &thread_count,
-            |b, thread_count| {
-                add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
-            },
-        );
-    }
-    drop(group);
-}
-
-/// Sets up a multi-threaded tokio runtime with default worker thread count,
-/// then, spawn `requesters` tasks that repeatedly:
-/// - get input from `input_factor()`
-/// - call `manager.request_redo()` with their input
-///
-/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
-///
-/// Using tokio's default worker thread count means the results will differ on machines
-/// with different core countrs. We don't care about that, the performance will always
-/// be different on different hardware. To compare performance of different software versions,
-/// use the same hardware.
-fn add_multithreaded_walredo_requesters(
-    b: &mut criterion::Bencher,
-    nrequesters: usize,
-    manager: &Arc<PostgresRedoManager>,
-    input_factory: fn() -> Request,
-) {
-    assert_ne!(nrequesters, 0);
-
     let rt = tokio::runtime::Builder::new_multi_thread()
         .enable_all()
         .build()
         .unwrap();
 
-    let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
+    let start = Arc::new(Barrier::new(nclients as usize));
 
-    let mut requesters = JoinSet::new();
-    for _ in 0..nrequesters {
-        let _entered = rt.enter();
-        let manager = manager.clone();
-        let barrier = barrier.clone();
-        requesters.spawn(async move {
-            loop {
-                let input = input_factory();
-                barrier.wait().await;
-                let page = input.execute(&manager).await.unwrap();
-                assert_eq!(page.remaining(), 8192);
-                barrier.wait().await;
-            }
+    let mut tasks = JoinSet::new();
+
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+    let manager = Arc::new(manager);
+
+    for _ in 0..nclients {
+        rt.block_on(async {
+            tasks.spawn(client(
+                Arc::clone(&manager),
+                Arc::clone(&start),
+                Arc::clone(&redo_work),
+                // divide the amount of work equally among the clients
+                n_redos / nclients,
+            ))
         });
     }
 
-    let do_one_iteration = || {
-        rt.block_on(async {
-            barrier.wait().await;
-            // wait for work to complete
-            barrier.wait().await;
-        })
-    };
-
-    b.iter_batched(
-        || {
-            // warmup
-            do_one_iteration();
-        },
-        |()| {
-            // work loop
-            do_one_iteration();
-        },
-        criterion::BatchSize::PerIteration,
-    );
-
-    rt.block_on(requesters.shutdown());
+    let mut total_wallclock_time = std::time::Duration::from_millis(0);
+    while let Some(res) = rt.block_on(tasks.join_next()) {
+        total_wallclock_time += res.unwrap();
+    }
+    total_wallclock_time
 }
 
-criterion_group!(benches, redo_scenarios);
-criterion_main!(benches);
+async fn client(
+    mgr: Arc<PostgresRedoManager>,
+    start: Arc<Barrier>,
+    redo_work: Arc<Request>,
+    n_redos: u64,
+) -> Duration {
+    start.wait().await;
+    let start = Instant::now();
+    for _ in 0..n_redos {
+        let page = redo_work.execute(&mgr).await.unwrap();
+        assert_eq!(page.remaining(), 8192);
+    }
+    start.elapsed()
+}
 
 macro_rules! lsn {
     ($input:expr) => {{
@@ -166,12 +134,46 @@ macro_rules! lsn {
     }};
 }
 
-/// Short payload, 1132 bytes.
-// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
-// for null bytes.
-#[allow(clippy::octal_escapes)]
-fn short() -> Request {
-    Request {
+/// Simple wrapper around `WalRedoManager::request_redo`.
+///
+/// In benchmarks this is cloned around.
+#[derive(Clone)]
+struct Request {
+    key: Key,
+    lsn: Lsn,
+    base_img: Option<(Lsn, Bytes)>,
+    records: Vec<(Lsn, NeonWalRecord)>,
+    pg_version: u32,
+}
+
+impl Request {
+    async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
+        let Request {
+            key,
+            lsn,
+            base_img,
+            records,
+            pg_version,
+        } = self;
+
+        // TODO: avoid these clones
+        manager
+            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
+            .await
+    }
+
+    fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
+        let rec = Bytes::from_static(bytes);
+        NeonWalRecord::Postgres { will_init, rec }
+    }
+
+    /// Short payload, 1132 bytes.
+    // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
+    // for null bytes.
+    #[allow(clippy::octal_escapes)]
+    pub fn short_input() -> Request {
+        let pg_record = Self::pg_record;
+        Request {
         key: Key {
             field1: 0,
             field2: 1663,
@@ -194,13 +196,14 @@
         ],
         pg_version: 14,
     }
-}
+    }
 
-/// Medium sized payload, serializes as 26393 bytes.
-// see [`short`]
-#[allow(clippy::octal_escapes)]
-fn medium() -> Request {
-    Request {
+    /// Medium sized payload, serializes as 26393 bytes.
+    // see [`short_input`]
+    #[allow(clippy::octal_escapes)]
+    pub fn medium_input() -> Request {
+        let pg_record = Self::pg_record;
+        Request {
         key: Key {
             field1: 0,
             field2: 1663,
@@ -442,37 +445,5 @@
         ],
         pg_version: 14,
     }
-}
-
-fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
-    let rec = Bytes::from_static(bytes);
-    NeonWalRecord::Postgres { will_init, rec }
-}
-
-/// Simple wrapper around `WalRedoManager::request_redo`.
-///
-/// In benchmarks this is cloned around.
-#[derive(Clone)]
-struct Request {
-    key: Key,
-    lsn: Lsn,
-    base_img: Option<(Lsn, Bytes)>,
-    records: Vec<(Lsn, NeonWalRecord)>,
-    pg_version: u32,
-}
-
-impl Request {
-    async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
-        let Request {
-            key,
-            lsn,
-            base_img,
-            records,
-            pg_version,
-        } = self;
-
-        manager
-            .request_redo(key, lsn, base_img, records, pg_version)
-            .await
+    }
 }

From 3d16cda846f4e7b8e929c61db13093586dca93d1 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Wed, 20 Mar 2024 18:03:09 +0200
Subject: [PATCH 2/4] refactor(layer): use detached init (#7152)

The second part of work towards fixing `Layer::keep_resident` so that
it does not need to repair the internal state. #7135 added a nicer API
for initialization. This PR uses it to remove a few indentation levels
and the loop construction. The next PR #7175 will use the refactorings
done in this PR, and always initialize the internal state after a
download.

Cc: #5331
---
 pageserver/src/tenant/storage_layer/layer.rs  | 329 +++++++++---------
 .../src/tenant/storage_layer/layer/tests.rs   |   3 +
 2 files changed, 166 insertions(+), 166 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index f37d7e6449..eed423c3e6 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -702,181 +702,132 @@ impl LayerInner {
         allow_download: bool,
         ctx: Option<&RequestContext>,
     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
-        let mut init_permit = None;
+        let (weak, permit) = {
+            let locked = self
+                .inner
+                .get_or_init_detached()
+                .await
+                .map(|mut guard| guard.get_and_upgrade().ok_or(guard));
 
-        loop {
-            let download = move |permit| {
-                async move {
-                    // disable any scheduled but not yet running eviction deletions for this
-                    let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+            match locked {
+                // this path could have been a RwLock::read
+                Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong),
+                Ok(Ok((strong, _))) => {
+                    // when upgraded back, the Arc is still available, but
+                    // previously an `evict_and_wait` was received.
+                    self.wanted_evicted.store(false, Ordering::Relaxed);
 
-                    // no need to make the evict_and_wait wait for the actual download to complete
+                    // error out any `evict_and_wait`
                     drop(self.status.send(Status::Downloaded));
-
-                    let timeline = self
-                        .timeline
-                        .upgrade()
-                        .ok_or_else(|| DownloadError::TimelineShutdown)?;
-
-                    // count cancellations, which currently remain largely unexpected
-                    let init_cancelled =
-                        scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
-                    let can_ever_evict = timeline.remote_client.as_ref().is_some();
-
-                    // check if we really need to be downloaded; could have been already downloaded by a
-                    // cancelled previous attempt.
-                    let needs_download = self
-                        .needs_download()
-                        .await
-                        .map_err(DownloadError::PreStatFailed);
-
-                    let needs_download = match needs_download {
-                        Ok(reason) => reason,
-                        Err(e) => {
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(e);
-                        }
-                    };
-
-                    let (permit, downloaded) = if let Some(reason) = needs_download {
-                        if let NeedsDownload::NotFile(ft) = reason {
-                            return Err(DownloadError::NotFile(ft));
-                        }
-
-                        // only reset this after we've decided we really need to download. otherwise it'd
-                        // be impossible to mark cancelled downloads for eviction, like one could imagine
-                        // we would like to do for prefetching which was not needed.
-                        self.wanted_evicted.store(false, Ordering::Release);
-
-                        if !can_ever_evict {
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(DownloadError::NoRemoteStorage);
-                        }
-
-                        if let Some(ctx) = ctx {
-                            let res = self.check_expected_download(ctx);
-                            if let Err(e) = res {
-                                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                                return Err(e);
-                            }
-                        }
-
-                        if !allow_download {
-                            // this does look weird, but for LayerInner the "downloading" means also changing
-                            // internal once related state ...
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(DownloadError::DownloadRequired);
-                        }
-
-                        tracing::info!(%reason, "downloading on-demand");
-
-                        let permit = self.spawn_download_and_wait(timeline, permit).await;
-
-                        let permit = match permit {
-                            Ok(permit) => permit,
-                            Err(e) => {
-                                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                                return Err(e);
-                            }
-                        };
-
-                        (permit, true)
-                    } else {
-                        // the file is present locally, probably by a previous but cancelled call to
-                        // get_or_maybe_download. alternatively we might be running without remote storage.
-                        LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                        (permit, false)
-                    };
-
-                    scopeguard::ScopeGuard::into_inner(init_cancelled);
-
-                    if downloaded {
-                        let since_last_eviction = self
-                            .last_evicted_at
-                            .lock()
-                            .unwrap()
-                            .take()
-                            .map(|ts| ts.elapsed());
-
-                        if let Some(since_last_eviction) = since_last_eviction {
-                            LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
-                        }
-                    }
-
-                    let res = Arc::new(DownloadedLayer {
-                        owner: Arc::downgrade(self),
-                        kind: tokio::sync::OnceCell::default(),
-                        version: next_version,
-                    });
-
-                    self.access_stats.record_residence_event(
-                        LayerResidenceStatus::Resident,
-                        LayerResidenceEventReason::ResidenceChange,
-                    );
-
-                    let waiters = self.inner.initializer_count();
-                    if waiters > 0 {
-                        tracing::info!(
-                            waiters,
-                            "completing the on-demand download for other tasks"
-                        );
-                    }
-
-                    Ok((ResidentOrWantedEvicted::Resident(res), permit))
-                }
-                .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
-            };
-
-            if let Some(init_permit) = init_permit.take() {
-                // use the already held initialization permit because it is impossible to hit the
-                // below paths anymore essentially limiting the max loop iterations to 2.
-                let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit);
-                let (strong, _upgraded) = guard
-                    .get_and_upgrade()
-                    .expect("init creates strong reference, we held the init permit");
-                return Ok(strong);
-            }
-
-            let (weak, permit) = {
-                let mut locked = self.inner.get_or_init(download).await?;
-
-                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
-                    if upgraded {
-                        // when upgraded back, the Arc is still available, but
-                        // previously an `evict_and_wait` was received.
-                        self.wanted_evicted.store(false, Ordering::Relaxed);
-
-                        // error out any `evict_and_wait`
-                        drop(self.status.send(Status::Downloaded));
-                        LAYER_IMPL_METRICS
-                            .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
-                    }
+                    LAYER_IMPL_METRICS
+                        .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
 
                     return Ok(strong);
-                } else {
+                }
+                Ok(Err(mut guard)) => {
                     // path to here: the evict_blocking is stuck on spawn_blocking queue.
                     //
                     // reset the contents, deactivating the eviction and causing a
                     // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
-                    locked.take_and_deinit()
+                    let (weak, permit) = guard.take_and_deinit();
+                    (Some(weak), permit)
                 }
-            };
-
-            // unlock first, then drop the weak, but because upgrade failed, we
-            // know it cannot be a problem.
+                Err(permit) => (None, permit),
+            }
+        };
+        if let Some(weak) = weak {
+            // only drop the weak after dropping the heavier_once_cell guard
             assert!(
                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
            );
-
-            init_permit = Some(permit);
-
-            LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
         }
+
+        async move {
+            // disable any scheduled but not yet running eviction deletions for this
+            let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+
+            // no need to make the evict_and_wait wait for the actual download to complete
+            drop(self.status.send(Status::Downloaded));
+
+            let timeline = self
+                .timeline
+                .upgrade()
+                .ok_or_else(|| DownloadError::TimelineShutdown)?;
+
+            // count cancellations, which currently remain largely unexpected
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
+            // check if we really need to be downloaded; could have been already downloaded by a
+            // cancelled previous attempt.
+            let needs_download = self
+                .needs_download()
+                .await
+                .map_err(DownloadError::PreStatFailed);
+
+            let needs_download = match needs_download {
+                Ok(reason) => reason,
+                Err(e) => {
+                    scopeguard::ScopeGuard::into_inner(init_cancelled);
+                    return Err(e);
+                }
+            };
+
+            let Some(reason) = needs_download else {
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+
+                // the file is present locally, probably by a previous but cancelled call to
+                // get_or_maybe_download. alternatively we might be running without remote storage.
+                LAYER_IMPL_METRICS.inc_init_needed_no_download();
+
+                let res = self.initialize_after_layer_is_on_disk(next_version, permit, false);
+                return Ok(res);
+            };
+
+            if let NeedsDownload::NotFile(ft) = reason {
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+                return Err(DownloadError::NotFile(ft));
+            }
+
+            // only reset this after we've decided we really need to download. otherwise it'd
+            // be impossible to mark cancelled downloads for eviction, like one could imagine
+            // we would like to do for prefetching which was not needed.
+            self.wanted_evicted.store(false, Ordering::Release);
+
+            if timeline.remote_client.as_ref().is_none() {
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+                return Err(DownloadError::NoRemoteStorage);
+            }
+
+            if let Some(ctx) = ctx {
+                let res = self.check_expected_download(ctx);
+                if let Err(e) = res {
+                    scopeguard::ScopeGuard::into_inner(init_cancelled);
+                    return Err(e);
+                }
+            }
+
+            if !allow_download {
+                // this does look weird, but for LayerInner the "downloading" means also changing
+                // internal once related state ...
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+                return Err(DownloadError::DownloadRequired);
+            }
+
+            tracing::info!(%reason, "downloading on-demand");
+
+            let permit = self.spawn_download_and_wait(timeline, permit).await;
+
+            scopeguard::ScopeGuard::into_inner(init_cancelled);
+
+            let permit = permit?;
+
+            let res = self.initialize_after_layer_is_on_disk(next_version, permit, true);
+            Ok(res)
+        }
+        .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
+        .await
     }
 
     /// Nag or fail per RequestContext policy
@@ -1026,6 +977,59 @@ impl LayerInner {
         }
     }
 
+    /// Initializes the `Self::inner` to a "resident" state.
+    ///
+    /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download`
+    /// before calling this method.
+    ///
+    /// If this method is ever made async, it needs to be cancellation safe so that no state
+    /// changes are made before we can write to the OnceCell in non-cancellable fashion.
+    fn initialize_after_layer_is_on_disk(
+        self: &Arc<Self>,
+        next_version: usize,
+        permit: heavier_once_cell::InitPermit,
+        downloaded: bool,
+    ) -> Arc<DownloadedLayer> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        if downloaded {
+            let since_last_eviction = self
+                .last_evicted_at
+                .lock()
+                .unwrap()
+                .take()
+                .map(|ts| ts.elapsed());
+            if let Some(since_last_eviction) = since_last_eviction {
+                // FIXME: this will not always be recorded correctly until #6028 (the no
+                // download needed branch above)
+                LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+            }
+        }
+
+        let res = Arc::new(DownloadedLayer {
+            owner: Arc::downgrade(self),
+            kind: tokio::sync::OnceCell::default(),
+            version: next_version,
+        });
+
+        // FIXME: this might now be double-accounted for !downloaded
+        self.access_stats.record_residence_event(
+            LayerResidenceStatus::Resident,
+            LayerResidenceEventReason::ResidenceChange,
+        );
+
+        let waiters = self.inner.initializer_count();
+        if waiters > 0 {
+            tracing::info!(waiters, "completing the on-demand download for other tasks");
+        }
+
+        let value = ResidentOrWantedEvicted::Resident(res.clone());
+
+        self.inner.set(value, permit);
+
+        res
+    }
+
     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
         match tokio::fs::metadata(&self.path).await {
             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -1690,11 +1694,6 @@ impl LayerImplMetrics {
         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
     }
 
-    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
-    fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
-    }
-
     /// Expected rare because cancellations are unexpected, and failures are unexpected
     fn inc_download_failed_without_requester(&self) {
         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
@@ -1779,7 +1778,6 @@ impl DeleteFailed {
 #[derive(enum_map::Enum)]
 enum RareEvent {
     RemoveOnDropFailed,
-    RetriedGetOrMaybeDownload,
     DownloadFailedWithoutRequester,
     UpgradedWantedEvicted,
     InitWithoutDownload,
@@ -1793,7 +1791,6 @@ impl RareEvent {
 
         match self {
             RemoveOnDropFailed => "remove_on_drop_failed",
-            RetriedGetOrMaybeDownload => "retried_gomd",
             DownloadFailedWithoutRequester => "download_failed_without",
             UpgradedWantedEvicted => "raced_wanted_evicted",
             InitWithoutDownload => "init_needed_no_download",
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index b43534efd4..e7b2eb025a 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -254,6 +254,8 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
     let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
         .unwrap();
     let (tenant, ctx) = h.load().await;
+    let span = h.span();
+    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
 
     let timeline = tenant
         .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
@@ -292,6 +294,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
     // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
     layer
         .keep_resident()
+        .instrument(download_span)
         .await
         .expect("keep_resident should have reinitialized without downloading")
         .expect("ResidentLayer");

From 929423cf68b7cbef1edd27c24523460d5e3ac9a8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 20 Mar 2024 18:04:14 +0000
Subject: [PATCH 3/4] add i3en.3xlarge reference numbers

---
 pageserver/benches/bench_walredo.rs | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 72c0f2c6c0..0d0e940bd5 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -24,6 +24,29 @@
 //! is called sampling. Apparently the idea here is to detect outliers.
 //! We're not sure whether the current choice of sampling method makes sense.
 //! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
+//!
+//! # Reference Numbers
+//!
+//! 2024-03-20 on i3en.3xlarge
+//!
+//! ```text
+//! short/1     time:   [21.612 µs 21.707 µs 21.817 µs]
+//! short/2     time:   [27.216 µs 27.372 µs 27.557 µs]
+//! short/4     time:   [44.398 µs 45.858 µs 47.178 µs]
+//! short/8     time:   [81.236 µs 83.332 µs 85.419 µs]
+//! short/16    time:   [138.29 µs 139.76 µs 141.24 µs]
+//! short/32    time:   [149.67 µs 150.44 µs 151.30 µs]
+//! short/64    time:   [155.31 µs 155.90 µs 156.59 µs]
+//! short/128   time:   [156.33 µs 156.85 µs 157.44 µs]
+//! medium/1    time:   [105.47 µs 105.87 µs 106.36 µs]
+//! medium/2    time:   [157.09 µs 157.68 µs 158.40 µs]
+//! medium/4    time:   [293.69 µs 306.80 µs 318.11 µs]
+//! medium/8    time:   [594.88 µs 614.05 µs 633.18 µs]
+//! medium/16   time:   [848.28 µs 853.06 µs 858.68 µs]
+//! medium/32   time:   [916.31 µs 920.97 µs 926.27 µs]
+//! medium/64   time:   [939.99 µs 945.49 µs 951.30 µs]
+//! medium/128  time:   [918.89 µs 928.75 µs 938.41 µs]
+//! ```
 
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};

From 2726b1934ebd9d12d976ce9e9a41783d9ab238a8 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 20 Mar 2024 18:07:45 +0000
Subject: [PATCH 4/4] pageserver: extra debug for test_secondary_downloads
 failures (#7183)

- Enable debug logs for this test
- Add some debug logging detail in downloader.rs
- Add an info-level message in scheduler.rs that makes it obvious if a
  command is waiting for an existing task rather than spawning a new
  one.
---
 pageserver/src/tenant/secondary/downloader.rs    | 13 ++++++++++++-
 pageserver/src/tenant/secondary/scheduler.rs     |  1 +
 test_runner/regress/test_pageserver_secondary.py |  4 ++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index a595096133..82af7ed83b 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -534,7 +534,11 @@ impl<'a> TenantDownloader<'a> {
             .await
             .maybe_fatal_err(&context_msg)?;
 
-        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
+        tracing::debug!(
+            "Wrote local heatmap to {}, with {} timelines",
+            heatmap_path,
+            heatmap.timelines.len()
+        );
 
         // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
         // principle that deletions should be done before writes wherever possible, and so that we can use this
@@ -547,6 +551,10 @@ impl<'a> TenantDownloader<'a> {
         // Download the layers in the heatmap
         for timeline in heatmap.timelines {
             if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!(
+                    "Cancelled before downloading timeline {}",
+                    timeline.timeline_id
+                );
                 return Ok(());
             }
@@ -764,10 +772,13 @@ impl<'a> TenantDownloader<'a> {
             }
         };
 
+        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
+
         // Download heatmap layers that are not present on local disk, or update their
         // access time if they are already present.
         for layer in timeline.layers {
             if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!("Cancelled -- dropping out of layer loop");
                 return Ok(());
             }
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
index 58bdb54161..3bd7be782e 100644
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -300,6 +300,7 @@ where
             let tenant_shard_id = job.get_tenant_shard_id();
 
             let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
+                tracing::info!("Command already running, waiting for it");
                 barrier
             } else {
                 let running = self.spawn_now(job);
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index e664547b69..2e57136607 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -432,6 +432,10 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     - Eviction of layers on the attached location results in deletion
       on the secondary location as well.
     """
+
+    # For debug of https://github.com/neondatabase/neon/issues/6966
+    neon_env_builder.rust_log_override = "DEBUG"
+
     neon_env_builder.num_pageservers = 2
     neon_env_builder.enable_pageserver_remote_storage(
         remote_storage_kind=RemoteStorageKind.MOCK_S3,