From 053abff71f41a2d3eefaef4c94ac7f65b6956c47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Fri, 17 Jan 2025 15:21:30 +0100
Subject: [PATCH 01/37] Fix dependency on neon-image in promote-images-dev
 (#10437)

## Problem
871e8b325f1509c0ec5cba03537297847345c02e failed CI on main because a job
ran too soon. This was caused by
ea84ec357fa4caa5a48ec65a0aab9e37d1a9fda4. While `promote-images-dev`
does not inherently need `neon-image`, a few jobs depending on
`promote-images-dev` do need it, and previously had it when it was
`promote-images`, which depended on `test-images`, which in turn
depended on `neon-image`.

## Summary of changes
To ensure jobs depending on `docker.io/neondatabase/neon` images get them,
`promote-images-dev` gets the dependency on `neon-image` back, which it
previously had transitively through `test-images`.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9ec5273af7..b0e07535b3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -824,7 +824,7 @@ jobs:
           docker compose -f ./docker-compose/docker-compose.yml down
 
   promote-images-dev:
-    needs: [ check-permissions, tag, vm-compute-node-image ]
+    needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
     runs-on: ubuntu-22.04
 
     permissions:

From 6975228a766bc2e5df36559a49fee0ef3417283a Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 17 Jan 2025 14:51:33 +0000
Subject: [PATCH 02/37] pageserver: add initdb metrics (#10434)

## Problem

Initdb observability is poor.

## Summary of changes

Add some metrics so we can figure out which part, if any, is slow.

Closes https://github.com/neondatabase/neon/issues/10423
---
 pageserver/src/metrics.rs | 26 ++++++++++++++++++++++++++
 pageserver/src/tenant.rs  | 14 +++++++++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 3c4830e3cd..4758aaf230 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -100,6 +100,32 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) static CONCURRENT_INITDBS: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_concurrent_initdb",
+        "Number of initdb processes running"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_initdb_semaphore_seconds_global",
+        "Time spent getting a permit from the global initdb semaphore",
+        STORAGE_OP_BUCKETS.into()
+    )
+    .expect("failed to define metric")
+});
+
+pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_initdb_seconds_global",
+        "Time spent performing initdb",
+        STORAGE_OP_BUCKETS.into()
+    )
+    .expect("failed to define metric")
+});
+
 // Metrics collected on operations on the storage repository.
 #[derive(
     Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f6d758ad22..bb1b36aed6 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -95,6 +95,9 @@ use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::l0_flush::L0FlushGlobalState;
+use crate::metrics::CONCURRENT_INITDBS;
+use crate::metrics::INITDB_RUN_TIME;
+use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME;
 use crate::metrics::TENANT;
 use crate::metrics::{
     remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
@@ -5347,8 +5350,17 @@ async fn run_initdb(
         initdb_bin_path, initdb_target_dir, initdb_lib_dir,
     );
 
-    let _permit = INIT_DB_SEMAPHORE.acquire().await;
+    let _permit = {
+        let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
+        INIT_DB_SEMAPHORE.acquire().await
+    };
 
+    CONCURRENT_INITDBS.inc();
+    scopeguard::defer! {
+        CONCURRENT_INITDBS.dec();
+    }
+
+    let _timer = INITDB_RUN_TIME.start_timer();
     let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
         superuser: &conf.superuser,
         locale: &conf.locale,

From b0f34099f90cfa08223ed653a7c7460943f34f0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 17 Jan 2025 22:43:52 +0100
Subject: [PATCH 03/37] Add safekeeper utilization endpoint (#10429)

Add an endpoint to obtain the utilization of a safekeeper. Future
changes to the storage controller can use this endpoint to find the most
suitable safekeepers for newly created timelines, analogously to how
it's done for pageservers already.

Initially we just want to assign by timeline count, then we can iterate
from there.

Part of https://github.com/neondatabase/neon/issues/9011
---
 libs/safekeeper_api/src/models.rs      |  5 +++++
 safekeeper/client/src/mgmt_api.rs      |  5 +++++
 safekeeper/src/http/routes.rs          |  8 ++++++++
 safekeeper/src/timelines_global_map.rs | 15 +++++++++++++++
 4 files changed, 33 insertions(+)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index b5fa903820..30418b0efd 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -277,3 +277,8 @@ pub struct TimelineTermBumpResponse {
     pub previous_term: u64,
     pub current_term: u64,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct SafekeeperUtilization {
+    pub timeline_count: u64,
+}
diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs
index f78745043a..5727f32509 100644
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -102,6 +102,11 @@ impl Client {
         self.get(&uri).await
     }
 
+    pub async fn utilization(&self) -> Result<reqwest::Response> {
+        let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint);
+        self.get(&uri).await
+    }
+
     async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
         self.request(Method::GET, uri, ()).await
     }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 4b9fb9eb67..7ec08ecf9a 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -127,6 +127,13 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
     json_response(StatusCode::OK, ())
 }
 
+async fn utilization_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let global_timelines = get_global_timelines(&request);
+    let utilization = global_timelines.get_timeline_counts();
+    json_response(StatusCode::OK, utilization)
+}
+
 /// List all (not deleted) timelines.
 /// Note: it is possible to do the same with debug_dump.
 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -620,6 +627,7 @@ pub fn make_router(
                 failpoints_handler(r, cancel).await
             })
         })
+        .get("/v1/utilization", |r| request_span(r, utilization_handler))
         .delete("/v1/tenant/:tenant_id", |r| {
             request_span(r, tenant_delete_handler)
         })
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index a701534f65..01c6aff6c3 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -13,6 +13,7 @@ use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
 use camino_tempfile::Utf8TempDir;
 use safekeeper_api::membership::Configuration;
+use safekeeper_api::models::SafekeeperUtilization;
 use safekeeper_api::ServerInfo;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -416,6 +417,20 @@ impl GlobalTimelines {
             .collect()
     }
 
+    /// Returns statistics about timeline counts
+    pub fn get_timeline_counts(&self) -> SafekeeperUtilization {
+        let global_lock = self.state.lock().unwrap();
+        let timeline_count = global_lock
+            .timelines
+            .values()
+            .filter(|t| match t {
+                GlobalMapTimeline::CreationInProgress => false,
+                GlobalMapTimeline::Timeline(t) => !t.is_cancelled(),
+            })
+            .count() as u64;
+        SafekeeperUtilization { timeline_count }
+    }
+
     /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
     /// and that's why it can return cancelled timelines, to retry deleting them.
     fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {

From 8bdaee35f3dec86b37bb6b91be57a88a86d9ad33 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 20 Jan 2025 09:20:31 +0000
Subject: [PATCH 04/37] pageserver: safety checks on validity of uploaded
 indices (#10403)

## Problem

Occasionally, we encounter bugs in test environments that can be
detected at the point of uploading an index, but we proceed to upload it
anyway and leave a tenant in a broken state that's awkward to handle.

## Summary of changes

- Validate index when submitting it for upload, so that we can see the
issue quickly e.g. in an API invoking compaction
- Validate index before executing the upload, so that we have a hard
enforcement that any code path that tries to upload an index will not
overwrite a valid index with an invalid one.
---
 .../src/tenant/remote_timeline_client.rs      |  6 ++
 .../tenant/remote_timeline_client/index.rs    | 15 +++++
 .../tenant/remote_timeline_client/upload.rs   |  4 ++
 .../src/tenant/storage_layer/layer/tests.rs   | 56 ++++++++++++++++---
 pageserver/src/tenant/timeline.rs             | 20 ++++++-
 5 files changed, 91 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index a006647785..bcba6d1f62 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -803,6 +803,12 @@ impl RemoteTimelineClient {
 
         upload_queue.dirty.metadata.apply(update);
 
+        // Defense in depth: if we somehow generated invalid metadata, do not persist it.
+        upload_queue
+            .dirty
+            .validate()
+            .map_err(|e| anyhow::anyhow!(e))?;
+
         self.schedule_index_upload(upload_queue);
 
         Ok(())
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 244be5bbb7..08e94ae197 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -152,6 +152,21 @@ impl IndexPart {
         };
         is_same_remote_layer_path(name, metadata, name, index_metadata)
     }
+
+    /// Check for invariants in the index: this is useful when uploading an index to ensure that if
+    /// we encounter a bug, we do not persist buggy metadata.
+    pub(crate) fn validate(&self) -> Result<(), String> {
+        if self.import_pgdata.is_none()
+            && self.metadata.ancestor_timeline().is_none()
+            && self.layer_metadata.is_empty()
+        {
+            // Unless we're in the middle of a raw pgdata import, or this is a child timeline, the index
+            // must always have at least one layer.
+            return Err("Index has no ancestor and no layers".to_string());
+        }
+
+        Ok(())
+    }
 }
 
 /// Metadata gathered for each of the layer files.
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index e434d24e5f..af4dbbbfb6 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part(
     });
     pausable_failpoint!("before-upload-index-pausable");
 
+    // Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this
+    // (this should never happen)
+    index_part.validate().map_err(|e| anyhow::anyhow!(e))?;
+
     // FIXME: this error comes too late
     let serialized = index_part.to_json_bytes()?;
     let serialized = Bytes::from(serialized);
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 36dcc8d805..fcb73ad20d 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,6 +1,6 @@
 use std::time::UNIX_EPOCH;
 
-use pageserver_api::key::CONTROLFILE_KEY;
+use pageserver_api::key::{Key, CONTROLFILE_KEY};
 use tokio::task::JoinSet;
 use utils::{
     completion::{self, Completion},
@@ -9,7 +9,10 @@ use utils::{
 
 use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
-use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
+use crate::{
+    context::DownloadBehavior,
+    tenant::{harness::test_img, storage_layer::LayerVisibilityHint},
+};
 use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
 
 /// Used in tests to advance a future to wanted await point, and not futher.
@@ -31,20 +34,51 @@ async fn smoke_test() {
 
     let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
 
+    let image_layers = vec![(
+        Lsn(0x40),
+        vec![(
+            Key::from_hex("620000000033333333444444445500000000").unwrap(),
+            test_img("foo"),
+        )],
+    )];
+
+    // Create a test timeline with one real layer, and one synthetic test layer.  The synthetic
+    // one is only there so that we can GC the real one without leaving the timeline's metadata
+    // empty, which is an illegal state (see [`IndexPart::validate`]).
     let timeline = tenant
-        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .create_test_timeline_with_layers(
+            TimelineId::generate(),
+            Lsn(0x10),
+            14,
+            &ctx,
+            Default::default(),
+            image_layers,
+            Lsn(0x100),
+        )
         .await
         .unwrap();
 
-    let layer = {
+    // Grab one of the timeline's layers to exercise in the test, and the other layer that is just
+    // there to avoid the timeline being illegally empty
+    let (layer, dummy_layer) = {
         let mut layers = {
             let layers = timeline.layers.read().await;
             layers.likely_resident_layers().cloned().collect::<Vec<_>>()
         };
 
-        assert_eq!(layers.len(), 1);
+        assert_eq!(layers.len(), 2);
 
-        layers.swap_remove(0)
+        layers.sort_by_key(|l| l.layer_desc().get_key_range().start);
+        let synthetic_layer = layers.pop().unwrap();
+        let real_layer = layers.pop().unwrap();
+        tracing::info!(
+            "real_layer={:?} ({}), synthetic_layer={:?} ({})",
+            real_layer,
+            real_layer.layer_desc().file_size,
+            synthetic_layer,
+            synthetic_layer.layer_desc().file_size
+        );
+        (real_layer, synthetic_layer)
     };
 
     // all layers created at pageserver are like `layer`, initialized with strong
@@ -173,10 +207,13 @@ async fn smoke_test() {
 
     let rtc = &timeline.remote_client;
 
+    // Simulate GC removing our test layer.
     {
-        let layers = &[layer];
         let mut g = timeline.layers.write().await;
+
+        let layers = &[layer];
         g.open_mut().unwrap().finish_gc_timeline(layers);
+
         // this just updates the remote_physical_size for demonstration purposes
         rtc.schedule_gc_update(layers).unwrap();
     }
@@ -191,7 +228,10 @@ async fn smoke_test() {
 
     rtc.wait_completion().await.unwrap();
 
-    assert_eq!(rtc.get_remote_physical_size(), 0);
+    assert_eq!(
+        rtc.get_remote_physical_size(),
+        dummy_layer.metadata().file_size
+    );
     assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2ba71416b8..5f4272fb2b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5678,9 +5678,17 @@ impl Timeline {
         info!("force created image layer {}", image_layer.local_path());
         {
             let mut guard = self.layers.write().await;
-            guard.open_mut().unwrap().force_insert_layer(image_layer);
+            guard
+                .open_mut()
+                .unwrap()
+                .force_insert_layer(image_layer.clone());
         }
 
+        // Update remote_timeline_client state to reflect existence of this layer
+        self.remote_client
+            .schedule_layer_file_upload(image_layer)
+            .unwrap();
+
         Ok(())
     }
 
@@ -5731,9 +5739,17 @@ impl Timeline {
         info!("force created delta layer {}", delta_layer.local_path());
         {
             let mut guard = self.layers.write().await;
-            guard.open_mut().unwrap().force_insert_layer(delta_layer);
+            guard
+                .open_mut()
+                .unwrap()
+                .force_insert_layer(delta_layer.clone());
         }
 
+        // Update remote_timeline_client state to reflect existence of this layer
+        self.remote_client
+            .schedule_layer_file_upload(delta_layer)
+            .unwrap();
+
         Ok(())
     }
 

From 7d761a9d22e0c3ca0e337af1793b65cb4d3f7203 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 20 Jan 2025 09:47:23 +0000
Subject: [PATCH 05/37] storage controller: make chaos less disruptive to AZ
 locality (#10438)

## Problem

Since #9916, the chaos code is actively fighting the optimizer: tenants
tend to be attached in their preferred AZ, so most chaos migrations were
moving them to a non-preferred AZ.

## Summary of changes

- When picking migrations, prefer to migrate things _toward_ their
preferred AZ when possible. Then pick shards to move the other way when
necessary.

The resulting behavior should be an alternating "back and forth" where
the chaos code migrates things away from home, and then migrates them
back on the next iteration.

The side effect will be that the chaos code actively helps to push
things into their home AZ. That's not contrary to its purpose though: we
mainly just want it to continuously migrate things to exercise
migration+notification code.
---
 .../src/service/chaos_injector.rs             | 110 ++++++++++++------
 storage_controller/src/tenant_shard.rs        |  17 +++
 2 files changed, 93 insertions(+), 34 deletions(-)

diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs
index 0e551beaa7..98034421d6 100644
--- a/storage_controller/src/service/chaos_injector.rs
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -1,11 +1,17 @@
-use std::{sync::Arc, time::Duration};
+use std::{
+    collections::{BTreeMap, HashMap},
+    sync::Arc,
+    time::Duration,
+};
 
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use rand::seq::SliceRandom;
 use rand::thread_rng;
 use tokio_util::sync::CancellationToken;
+use utils::id::NodeId;
+use utils::shard::TenantShardId;
 
-use super::Service;
+use super::{Node, Scheduler, Service, TenantShard};
 
 pub struct ChaosInjector {
     service: Arc<Service>,
@@ -35,50 +41,86 @@ impl ChaosInjector {
         }
     }
 
+    /// If a shard has a secondary and attached location, then re-assign the secondary to be
+    /// attached and the attached to be secondary.
+    ///
+    /// Only modifies tenants if they're in Active scheduling policy.
+    fn maybe_migrate_to_secondary(
+        &self,
+        tenant_shard_id: TenantShardId,
+        nodes: &Arc<HashMap<NodeId, Node>>,
+        tenants: &mut BTreeMap<TenantShardId, TenantShard>,
+        scheduler: &mut Scheduler,
+    ) {
+        let shard = tenants
+            .get_mut(&tenant_shard_id)
+            .expect("Held lock between choosing ID and this get");
+
+        if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
+            // Skip non-active scheduling policies, so that a shard with a policy like Pause can
+            // be pinned without being disrupted by us.
+            tracing::info!(
+                "Skipping shard {tenant_shard_id}: scheduling policy is {:?}",
+                shard.get_scheduling_policy()
+            );
+            return;
+        }
+
+        // Pick a secondary to promote
+        let Some(new_location) = shard
+            .intent
+            .get_secondary()
+            .choose(&mut thread_rng())
+            .cloned()
+        else {
+            tracing::info!(
+                "Skipping shard {tenant_shard_id}: no secondary location, can't migrate"
+            );
+            return;
+        };
+
+        let Some(old_location) = *shard.intent.get_attached() else {
+            tracing::info!("Skipping shard {tenant_shard_id}: currently has no attached location");
+            return;
+        };
+
+        tracing::info!("Injecting chaos: migrate {tenant_shard_id} {old_location}->{new_location}");
+
+        shard.intent.demote_attached(scheduler, old_location);
+        shard.intent.promote_attached(scheduler, new_location);
+        self.service.maybe_reconcile_shard(shard, nodes);
+    }
+
     async fn inject_chaos(&mut self) {
         // Pick some shards to interfere with
         let batch_size = 128;
         let mut inner = self.service.inner.write().unwrap();
         let (nodes, tenants, scheduler) = inner.parts_mut();
         let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
-        let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
 
-        for victim in victims {
-            let shard = tenants
-                .get_mut(victim)
-                .expect("Held lock between choosing ID and this get");
-
-            if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
-                // Skip non-active scheduling policies, so that a shard with a policy like Pause can
-                // be pinned without being disrupted by us.
-                tracing::info!(
-                    "Skipping shard {victim}: scheduling policy is {:?}",
-                    shard.get_scheduling_policy()
-                );
-                continue;
+        // Prefer to migrate tenants that are currently outside their home AZ.  This avoids the chaos injector
+        // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some
+        // random tenants to move, and then on next chaos iteration moving them back, then picking some new
+        // random tenants on the next iteration.
+        let mut victims = Vec::with_capacity(batch_size);
+        for shard in tenants.values() {
+            if shard.is_attached_outside_preferred_az(nodes) {
+                victims.push(shard.tenant_shard_id);
             }
 
-            // Pick a secondary to promote
-            let Some(new_location) = shard
-                .intent
-                .get_secondary()
-                .choose(&mut thread_rng())
-                .cloned()
-            else {
-                tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
-                continue;
-            };
+            if victims.len() >= batch_size {
+                break;
+            }
+        }
 
-            let Some(old_location) = *shard.intent.get_attached() else {
-                tracing::info!("Skipping shard {victim}: currently has no attached location");
-                continue;
-            };
+        let choose_random = batch_size.saturating_sub(victims.len());
+        tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len());
 
-            tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}");
+        let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random);
+        victims.extend(random_victims);
 
-            shard.intent.demote_attached(scheduler, old_location);
-            shard.intent.promote_attached(scheduler, new_location);
-            self.service.maybe_reconcile_shard(shard, nodes);
+        for victim in victims {
+            self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler);
         }
     }
 }
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 79ed628c25..cbc2696b26 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1793,6 +1793,23 @@ impl TenantShard {
             }
         }
     }
+
+    /// Returns true if the tenant shard is attached to a node that is outside the preferred AZ.
+    ///
+    /// If the shard has no preferred AZ or no attached location, returns false.
+    pub(crate) fn is_attached_outside_preferred_az(&self, nodes: &HashMap<NodeId, Node>) -> bool {
+        let Some(preferred_az) = self.intent.preferred_az_id.as_ref() else {
+            return false;
+        };
+        self.intent
+            .get_attached()
+            .map(|node_id| {
+                let node = nodes.get(&node_id).expect("referenced node exists");
+                // "Outside" means the attached node's AZ differs from the preferred one.
+                node.get_availability_zone_id() != preferred_az
+            })
+            .unwrap_or(false)
+    }
 }
 
 impl Drop for TenantShard {

From b312a3c320695a4b528968250225dfbd40af0e2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 20 Jan 2025 13:50:44 +0100
Subject: [PATCH 06/37] Move DeleteTimelineFlow::prepare to separate function
 and use enum (#10334)

It was requested by review in #10305 to use an enum or something like it
for distinguishing the different modes instead of two parameters,
because two flags allow four combinations, and two of them don't really
make sense/ aren't used.

follow-up of #10305
---
 pageserver/src/tenant/timeline/delete.rs  | 149 +++++++++++-----------
 pageserver/src/tenant/timeline/offload.rs |  12 +-
 2 files changed, 81 insertions(+), 80 deletions(-)

diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index bdc315d985..3c828c8a9e 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -112,7 +112,7 @@ pub(super) async fn delete_local_timeline_directory(
 }
 
 /// It is important that this gets called when DeletionGuard is being held.
-/// For more context see comments in [`DeleteTimelineFlow::prepare`]
+/// For more context see comments in [`make_timeline_delete_guard`]
 async fn remove_maybe_offloaded_timeline_from_tenant(
     tenant: &Tenant,
     timeline: &TimelineOrOffloaded,
@@ -193,10 +193,8 @@ impl DeleteTimelineFlow {
     ) -> Result<(), DeleteTimelineError> {
         super::debug_assert_current_span_has_tenant_and_timeline_id();
 
-        let allow_offloaded_children = false;
-        let set_stopping = true;
         let (timeline, mut guard) =
-            Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?;
+            make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?;
 
         guard.mark_in_progress()?;
 
@@ -333,75 +331,6 @@ impl DeleteTimelineFlow {
         Ok(())
     }
 
-    pub(super) fn prepare(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-        allow_offloaded_children: bool,
-        set_stopping: bool,
-    ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
-        // Note the interaction between this guard and deletion guard.
-        // Here we attempt to lock deletion guard when we're holding a lock on timelines.
-        // This is important because when you take into account `remove_timeline_from_tenant`
-        // we remove timeline from memory when we still hold the deletion guard.
-        // So here when timeline deletion is finished timeline wont be present in timelines map at all
-        // which makes the following sequence impossible:
-        // T1: get preempted right before the try_lock on `Timeline::delete_progress`
-        // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
-        // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
-        // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
-        let timelines = tenant.timelines.lock().unwrap();
-        let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
-
-        let timeline = match timelines.get(&timeline_id) {
-            Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
-            None => match timelines_offloaded.get(&timeline_id) {
-                Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
-                None => return Err(DeleteTimelineError::NotFound),
-            },
-        };
-
-        // Ensure that there are no child timelines, because we are about to remove files,
-        // which will break child branches
-        let mut children = Vec::new();
-        if !allow_offloaded_children {
-            children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
-                (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
-            }));
-        }
-        children.extend(timelines.iter().filter_map(|(id, entry)| {
-            (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
-        }));
-
-        if !children.is_empty() {
-            return Err(DeleteTimelineError::HasChildren(children));
-        }
-
-        // Note that using try_lock here is important to avoid a deadlock.
-        // Here we take lock on timelines and then the deletion guard.
-        // At the end of the operation we're holding the guard and need to lock timelines map
-        // to remove the timeline from it.
-        // Always if you have two locks that are taken in different order this can result in a deadlock.
-
-        let delete_progress = Arc::clone(timeline.delete_progress());
-        let delete_lock_guard = match delete_progress.try_lock_owned() {
-            Ok(guard) => DeletionGuard(guard),
-            Err(_) => {
-                // Unfortunately if lock fails arc is consumed.
-                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
-                    timeline.delete_progress(),
-                )));
-            }
-        };
-
-        if set_stopping {
-            if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
-                timeline.set_state(TimelineState::Stopping);
-            }
-        }
-
-        Ok((timeline, delete_lock_guard))
-    }
-
     fn schedule_background(
         guard: DeletionGuard,
         conf: &'static PageServerConf,
@@ -483,6 +412,80 @@ impl DeleteTimelineFlow {
     }
 }
 
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub(super) enum TimelineDeleteGuardKind {
+    Offload,
+    Delete,
+}
+
+pub(super) fn make_timeline_delete_guard(
+    tenant: &Tenant,
+    timeline_id: TimelineId,
+    guard_kind: TimelineDeleteGuardKind,
+) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
+    // Note the interaction between this guard and deletion guard.
+    // Here we attempt to lock deletion guard when we're holding a lock on timelines.
+    // This is important because when you take into account `remove_timeline_from_tenant`
+    // we remove timeline from memory when we still hold the deletion guard.
+    // So here when timeline deletion is finished timeline won't be present in timelines map at all
+    // which makes the following sequence impossible:
+    // T1: get preempted right before the try_lock on `Timeline::delete_progress`
+    // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
+    // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
+    // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
+    let timelines = tenant.timelines.lock().unwrap();
+    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
+
+    let timeline = match timelines.get(&timeline_id) {
+        Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
+        None => match timelines_offloaded.get(&timeline_id) {
+            Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
+            None => return Err(DeleteTimelineError::NotFound),
+        },
+    };
+
+    // Ensure that there are no child timelines, because we are about to remove files,
+    // which will break child branches
+    let mut children = Vec::new();
+    if guard_kind == TimelineDeleteGuardKind::Delete {
+        children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
+            (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
+        }));
+    }
+    children.extend(timelines.iter().filter_map(|(id, entry)| {
+        (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
+    }));
+
+    if !children.is_empty() {
+        return Err(DeleteTimelineError::HasChildren(children));
+    }
+
+    // Note that using try_lock here is important to avoid a deadlock.
+    // Here we take lock on timelines and then the deletion guard.
+    // At the end of the operation we're holding the guard and need to lock timelines map
+    // to remove the timeline from it.
+    // Whenever two locks are taken in different orders, this can result in a deadlock.
+
+    let delete_progress = Arc::clone(timeline.delete_progress());
+    let delete_lock_guard = match delete_progress.try_lock_owned() {
+        Ok(guard) => DeletionGuard(guard),
+        Err(_) => {
+            // Unfortunately if lock fails arc is consumed.
+            return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
+                timeline.delete_progress(),
+            )));
+        }
+    };
+
+    if guard_kind == TimelineDeleteGuardKind::Delete {
+        if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
+            timeline.set_state(TimelineState::Stopping);
+        }
+    }
+
+    Ok((timeline, delete_lock_guard))
+}
+
 pub(super) struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
 
 impl Deref for DeletionGuard {
diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs
index 6c6b19e8b1..3b5bf8290c 100644
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -2,10 +2,11 @@ use std::sync::Arc;
 
 use pageserver_api::models::{TenantState, TimelineState};
 
-use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
+use super::delete::{delete_local_timeline_directory, DeletionGuard};
 use super::Timeline;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
+use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
 use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
 
 #[derive(thiserror::Error, Debug)]
@@ -36,13 +37,10 @@ pub(crate) async fn offload_timeline(
     debug_assert_current_span_has_tenant_and_timeline_id();
     tracing::info!("offloading archived timeline");
 
-    let allow_offloaded_children = true;
-    let set_stopping = false;
-    let (timeline, guard) = DeleteTimelineFlow::prepare(
+    let (timeline, guard) = make_timeline_delete_guard(
         tenant,
         timeline.timeline_id,
-        allow_offloaded_children,
-        set_stopping,
+        TimelineDeleteGuardKind::Offload,
     )
     .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
 
@@ -106,7 +104,7 @@ pub(crate) async fn offload_timeline(
 }
 
 /// It is important that this gets called when DeletionGuard is being held.
-/// For more context see comments in [`DeleteTimelineFlow::prepare`]
+/// For more context see comments in [`make_timeline_delete_guard`]
 ///
 /// Returns the strong count of the timeline `Arc`
 fn remove_timeline_from_tenant(

From 02fc58b878d4342c05c084cd7db7a01940a70c3f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 20 Jan 2025 15:37:24 +0100
Subject: [PATCH 07/37] impr(timeline handles): add more tests covering
 reference cycle (#10446)

The other tests focus on the external interface usage while the tests
added in this PR add some testing around HandleInner's lifecycle,
ensuring we don't leak it once either connection gets dropped or
per-timeline-state is shut down explicitly.
---
 pageserver/src/tenant/timeline/handle.rs | 97 ++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
index 35d8c75ce1..4c7bea25be 100644
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1132,4 +1132,101 @@ mod tests {
         // There should be no strong references to the timeline object except the one on "stack".
         assert_eq!(Arc::strong_count(&shard0), refcount_start);
     }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_reference_cycle_broken_when_cache_is_dropped() {
+        crate::tenant::harness::setup_logging();
+        let timeline_id = TimelineId::generate();
+        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
+            gate: Default::default(),
+            id: timeline_id,
+            shard: ShardIdentity::unsharded(),
+            per_timeline_state: PerTimelineState::default(),
+            myself: myself.clone(),
+        });
+        let mgr = StubManager {
+            shards: vec![shard0.clone()],
+        };
+        let key = DBDIR_KEY;
+
+        let mut cache = Cache::<TestTypes>::default();
+
+        // helper to check if a handle is referenced by per_timeline_state
+        let per_timeline_state_refs_handle = |handle_weak: &Weak<Mutex<HandleInner<_>>>| {
+            let per_timeline_state = shard0.per_timeline_state.handles.lock().unwrap();
+            let per_timeline_state = per_timeline_state.as_ref().unwrap();
+            per_timeline_state
+                .values()
+                .any(|v| Weak::ptr_eq(&Arc::downgrade(v), handle_weak))
+        };
+
+        // Fill the cache.
+        let handle = cache
+            .get(timeline_id, ShardSelector::Page(key), &mgr)
+            .await
+            .expect("we have the timeline");
+        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
+        let handle_inner_weak = Arc::downgrade(&handle.inner);
+        assert!(
+            per_timeline_state_refs_handle(&handle_inner_weak),
+            "we still hold `handle` _and_ haven't dropped `cache` yet"
+        );
+
+        // Drop the cache.
+        drop(cache);
+
+        assert!(
+            !(per_timeline_state_refs_handle(&handle_inner_weak)),
+            "nothing should reference the handle allocation anymore"
+        );
+        assert!(
+            Weak::upgrade(&handle_inner_weak).is_some(),
+            "the local `handle` still keeps the allocation alive"
+        );
+        // but obviously the cache is gone so no new allocations can be handed out.
+
+        // Drop handle.
+        drop(handle);
+        assert!(
+            Weak::upgrade(&handle_inner_weak).is_none(),
+            "the local `handle` is dropped, so the allocation should be dropped by now"
+        );
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_reference_cycle_broken_when_per_timeline_state_shutdown() {
+        crate::tenant::harness::setup_logging();
+        let timeline_id = TimelineId::generate();
+        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
+            gate: Default::default(),
+            id: timeline_id,
+            shard: ShardIdentity::unsharded(),
+            per_timeline_state: PerTimelineState::default(),
+            myself: myself.clone(),
+        });
+        let mgr = StubManager {
+            shards: vec![shard0.clone()],
+        };
+        let key = DBDIR_KEY;
+
+        let mut cache = Cache::<TestTypes>::default();
+        let handle = cache
+            .get(timeline_id, ShardSelector::Page(key), &mgr)
+            .await
+            .expect("we have the timeline");
+        // grab a weak reference to the inner so we can later try to Weak::upgrade it and assert that it fails
+        let handle_inner_weak = Arc::downgrade(&handle.inner);
+
+        // drop the handle, obviously the lifetime of `inner` is at least as long as each strong reference to it
+        drop(handle);
+        assert!(Weak::upgrade(&handle_inner_weak).is_some(), "can still");
+
+        // Shutdown the per_timeline_state.
+        shard0.per_timeline_state.shutdown();
+        assert!(Weak::upgrade(&handle_inner_weak).is_none(), "can no longer");
+
+        // cache only contains Weak's, so it can outlive the per_timeline_state without keeping
+        // the handle allocation alive. Drop it explicitly solely to make this point.
+        drop(cache);
+    }
 }

From 2657b7ec7540df3d9060ff2ed15442ed14d7843c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 20 Jan 2025 17:33:07 +0000
Subject: [PATCH 08/37] rfcs: add sharded ingest RFC (#8754)

## Summary

Whereas currently we send all WAL to all pageserver shards, and each
shard filters out the data that it needs,
in this RFC we add a mechanism to filter the WAL on the safekeeper, so
that each shard receives
only the data it needs.

This will place some extra CPU load on the safekeepers, in exchange for
reducing the network bandwidth
for ingesting WAL back to scaling as O(1) with shard count, rather than
O(N_shards).

Touches #9329.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Vlad Lazar <vlalazar.vlad@gmail.com>
Co-authored-by: Vlad Lazar <vlad@neon.tech>
---
 docs/rfcs/041-sharded-ingest.md | 255 ++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 docs/rfcs/041-sharded-ingest.md

diff --git a/docs/rfcs/041-sharded-ingest.md b/docs/rfcs/041-sharded-ingest.md
new file mode 100644
index 0000000000..47b314891c
--- /dev/null
+++ b/docs/rfcs/041-sharded-ingest.md
@@ -0,0 +1,255 @@
+# Sharded Ingest
+Created on Aug 2024
+Implemented on Jan 2025
+
+## Summary
+
+Data in large tenants is split up between multiple pageservers according to key hashes, as
+introduced in the [sharding RFC](031-sharding-static.md) and [shard splitting RFC](032-shard-splitting.md).
+
+Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs,
+in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives
+only the data it needs.
+
+This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth
+for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards).
+
+## Motivation
+
+1. Large databases require higher shard counts.  Whereas currently we run with up to 8 shards for tenants
+with a few TB of storage, the next order of magnitude capacity increase will require tens of shards, such
+that sending all WAL to all shards is impractical in terms of bandwidth.
+2. For contemporary database sizes (~2TB), the pageserver is the bottleneck for ingest: since each
+   shard has to decode and process the whole WAL, sharding doesn't fully relieve this bottleneck.  To achieve significantly higher ingest speeds, we need to filter the WAL earlier so that each pageserver
+   only has to process relevant parts.
+
+## Non Goals (if relevant)
+
+We do not seek to introduce multiple WALs per timeline, or to share the work of handling a timeline's
+WAL across safekeepers (beyond simple 3x replication).  This RFC may be thought of as an incremental
+move of the ingestion bottleneck up the stack: instead of high write rates bottlenecking on the
+pageserver, they will bottleneck on the safekeeper.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+Safekeeper, pageserver.
+
+There will be no control plane or storage controller coordination needed, as pageservers will directly
+indicate their sharding parameters to the safekeeper when subscribing for WAL.
+
+## Proposed implementation
+
+Terminology:
+- "Data pages" refers to postgres relation blocks, and SLRU blocks.
+- "Metadata pages" refers to everything else the pageserver stores, such as relation sizes and
+  directories of relations.
+
+### Phase 1: Refactor ingest
+
+Currently, pageserver ingest code is structured approximately as follows:
+1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network
+   socket
+2. `WalIngest::ingest_record` to translate the record into a series of page-level modifications
+3. `DatadirModification` accumulates page updates from several `ingest_record` calls, and when
+   its `commit()` method is called, flushes these into a Timeline's open `InMemoryLayer`.
+
+This process currently assumes access to a pageserver `Timeline` throughout `ingest_record` and
+from `DatadirModification`, which is used to do read-modify-write cycles on metadata pages
+such as relation sizes and the master DBDIR page.  It also assumes that records are ingested
+strictly one after the other: they cannot be ingested in parallel because each record assumes
+that earlier records' changes have already been applied to `Timeline`.
+
+This code will be refactored to disentangle the simple, fast decode of relation page writes
+from the more complex logic for updating internal metadata.  An intermediate representation
+called `InterpretedWalRecords` will be introduced.  This is similar to the internal state of
+a `DatadirModification`, but does not require access to a Timeline.  Instead of storing
+metadata updates as materialized writes to pages, it will accumulate these as abstract operations,
+for example rather than including a write to a relation size key, this structure will include
+an operation that indicates "Update relation _foo_'s size to the max of its current value and
+_bar_", such that these may be applied later to a real Timeline.
+
+The `DatadirModification` will be aware of the `EphemeralFile` format, so that as it accumulates
+simple page writes of relation blocks, it can write them directly into a buffer in the serialized
+format.  This will avoid the need to later deserialize/reserialize this data when passing the
+structure between safekeeper and pageserver.
+
+The new pipeline will be:
+1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network socket
+2. An `InterpretedWalRecords` is generated from the incoming WAL records.  This does not
+   require a reference to a Timeline.
+3. The logic that is currently spread between `WalIngest` and `DatadirModification` for updating
+   metadata will be refactored to consume the metadata operations from the `InterpretedWalRecords`
+   and turn them into literal writes to metadata pages.  This part must be done sequentially.
+4. The resulting buffer of metadata page writes is combined with the buffer of relation block
+   writes, and written into the `InMemoryLayer`.
+
+Implemented in:
+1. https://github.com/neondatabase/neon/pull/9472
+2. https://github.com/neondatabase/neon/pull/9504
+3. https://github.com/neondatabase/neon/pull/9524
+
+### Phase 2: Decode & filter on safekeeper
+
+In the previous phase, the ingest code was modified to be able to do most of its work without access to
+a Timeline: this first stage of ingest simply converts a series of binary wal records into
+a buffer of relation/SLRU page writes, and a buffer of abstract metadata writes.
+
+The modified ingest code may be transplanted from pageserver to safekeeper (probably via a
+shared crate).  The safekeeper->pageserver network protocol is modified to:
+ - in subscription requests, send the `ShardIdentity` from the pageserver to the safekeeper
+ - in responses, transmit an `InterpretedWalRecords` instead of a raw `WalRecord`.
+ - use the `ShardIdentity` to filter the `ProcessedWalIngest` to relevant content for
+   the subscribing shard before transmitting it.
+
+The overall behavior of the pageserver->safekeeper interaction remains the same, in terms of
+consistent LSN feedback, and connection management.  Only the payload of the subscriptions
+changes, to express an LSN range of WAL as a filtered `ProcessedWalIngest` instead of the
+raw data.
+
+The ingest code on the pageserver can now skip the part where it does the first phase of
+processing, as it will receive pre-processed, compressed data off the wire.
+
+Note that `InterpretedWalRecords` batches multiple `InterpretedWalRecord`(s) in the same network
+message. Safekeeper reads WAL in chunks of 16 blocks and then decodes as many Postgres WAL records
+as possible. Each Postgres WAL record maps to one `InterpretedWalRecord` for potentially multiple shards.
+Hence, the size of the batch is given by the number of Postgres WAL records that fit in 16 blocks.
+
+The protocol needs to support evolution. Protobuf was chosen here with the view that, in the future,
+we may migrate it to gRPC altogether.
+
+Implemented in:
+1. https://github.com/neondatabase/neon/pull/9746
+2. https://github.com/neondatabase/neon/pull/9821
+
+### Phase 3: Fan out interpreted WAL
+
+In the previous phase, the initial processing of WAL was moved to the safekeeper, but it is still
+done once for each shard: this will generate O(N_shards) CPU work on the safekeeper (especially
+when considering converting to Protobuf format and compression).
+
+To avoid this, we fan-out WAL from one (tenant, timeline, shard) to all other shards subscribed on
+the same safekeeper. Under normal operation, the WAL will be read from disk, decoded and interpreted
+_only_ once per (safekeeper, timeline).
+
+When the first shard of a sharded timeline subscribes to a given safekeeper a task is spawned
+for the WAL reader (`InterpretedWalReader`). This task reads WAL, decodes, interprets it and sends
+it to the sender (`InterpretedWalSender`). The sender is a future that is polled from the connection
+task. When further shards subscribe on the safekeeper they will attach themselves to the existing WAL reader.
+There are two cases to consider:
+1. The shard's requested `start_lsn` is ahead of the current position of the WAL reader. In this case, the shard
+will start receiving data when the reader reaches that LSN. The intuition here is that there's little to gain
+by letting shards "front-run" since compute backpressure is based on the laggard LSN.
+2. The shard's requested `start_lsn` is below the current position of the WAL reader. In this case, the WAL reader
+gets reset to this requested position (same intuition). Special care is taken such that advanced shards do not receive
+interpreted WAL records below their current position.
+
+The approach above implies that there is at most one WAL reader per (tenant, timeline) on a given safekeeper at any point in time.
+If this turns out to be operationally problematic, there's a trick we can deploy: `--max-delta-for-fanout` is an optional safekeeper
+argument that controls the max absolute delta between a new shard and the current WAL position of the WAL reader. If the absolute
+delta is above that value, a new reader is spawned. Note that there's currently no concurrency control on the number of WAL readers,
+so it's recommended to use large values to avoid pushing CPU utilisation too high.
+
+Unsharded tenants do not spawn a separate task for the interpreted WAL reader since there's no benefit to it. Instead they poll
+the reader and sender concurrently from the connection task.
+
+Shard splits are interesting here because it is the only case when the same shard might have two subscriptions at the same time.
+This is handled by giving readers a unique identifier. Both shards will receive the same data while respecting their requested start
+position.
+
+Implemented in:
+1. https://github.com/neondatabase/neon/pull/10190
+
+## Deployment
+
+Each phase shall be deployed independently. Special care should be taken around protocol changes.
+
+## Observability Tips
+
+* The safekeeper logs the protocol requested by the pageserver
+along with the pageserver ID, tenant, timeline and shard: `starting streaming from`.
+* There are metrics for the number of wal readers:
+  * `safekeeper_wal_readers{kind="task", target=~"pageserver.*"}` gives the number of wal reader tasks for each SK
+  * `safekeeper_wal_readers{kind="future", target=~"pageserver.*"}` gives the number of wal readers polled inline by each SK
+  * `safekeeper_interpreted_wal_reader_tasks` gives the number of wal reader tasks per tenant, timeline
+* Interesting log lines for the fan-out reader:
+  * `Spawning interpreted`: first shard creates the interpreted wal reader
+  * `Fanning out`: a subsequent shard attaches itself to an interpreted wal reader
+  * `Aborting interpreted`: all senders have finished and the reader task is being aborted
+
+## Future Optimizations
+
+This section describes some improvement areas which may be revisited in the future.
+
+### Buffering of Interpreted WAL
+
+The interpreted WAL reader may buffer interpreted WAL records in user space to help with serving
+subscriptions that are lagging behind the current position of the reader.
+
+Counterpoints:
+* Safekeepers serve many thousands of timelines and allocating a buffer for each might be wasteful,
+especially given that it would go unused on the happy path.
+* WAL is buffered in the kernel page cache. Usually we'd only pay the CPU cost of decoding and interpreting.
+
+### Tweaking the Pageserver Safekeeper Selection Algorithm
+
+We could make the pageserver aware of which safekeepers already host shards for the timeline along
+with their current WAL positions. The pageserver should then prefer safekeepers that are in the same
+AZ _and_ already have a shard with a position close to the desired start position.
+
+We currently run one safekeeper per AZ, so the point is moot until that changes.
+
+### Pipelining first ingest phase
+
+The first ingest phase is a stateless transformation of a binary WAL record into a pre-processed
+output per shard.  To put multiple CPUs to work, we may pipeline this processing up to some defined buffer
+depth.
+
+## Alternatives considered
+
+### Give safekeepers enough state to fully decode WAL
+
+In this RFC, we only do the first phase of ingest on the safekeeper, because this is
+the phase that is stateless.  Subsequent changes then happen on the pageserver, with
+access to the `Timeline` state.
+
+We could do more work on the safekeeper if we transmitted metadata state to the safekeeper
+when subscribing to the WAL: for example, by telling the safekeeper all the relation sizes,
+so that it could then generate all the metadata writes for relation sizes.
+
+We avoid doing this for several reasons:
+1. Complexity: it's a more invasive protocol change
+2. Decoupling: having the safekeeper understand the `ProcessedWalIngest` already somewhat
+   infects it with knowledge of the pageserver, but this is mainly an abstract structure
+   that describes postgres writes.  However, if we taught the safekeeper about the exact
+   way that pageserver deals with metadata keys, this would be a much tighter coupling.
+3. Load: once the WAL has been processed to the point that it can be split between shards,
+   it is preferable to share out work on the remaining shards rather than adding extra CPU
+   load to the safekeeper.
+
+### Do pre-processing on the compute instead of the safekeeper
+
+Since our first stage of ingest is stateless, it could be done at any stage in the pipeline,
+all the way up to the compute.
+
+We choose not to do this, because it is useful for the safekeeper to store the raw WAL rather
+than just the preprocessed WAL:
+- The safekeeper still needs to be able to serve raw WAL back to postgres for e.g. physical replication
+- It simplifies our paxos implementation to have the offset in the write log be literally
+  the same as the LSN
+- Raw WAL must have a stable protocol since we might have to re-ingest it at arbitrary points in the future.
+  Storing raw WAL gives us more flexibility to evolve the pageserver-safekeeper protocol.
+
+### Do wal pre-processing on shard 0 or a separate service, send it to other shards from there
+
+If we wanted to keep the safekeepers as entirely pure stores of raw WAL bytes, then
+we could do the initial decode and shard-splitting in some other location:
+- Shard 0 could subscribe to the full WAL and then send writes to other shards
+- A new intermediate service between the safekeeper and pageserver could do the splitting.
+
+So why not?
+- Extra network hop from shard 0 to the final destination shard
+- Clearly there is more infrastructure involved here compared with doing it inline on the safekeeper.
+- Safekeepers already have very light CPU load: typical cloud instance shapes with appropriate
+  disks for the safekeepers effectively have "free" CPU resources.
+- Doing extra work on shard 0 would complicate scheduling of shards on pageservers, because
+  shard 0 would have significantly higher CPU load under write workloads than other shards.

From 72130d7d6c975df81249b4c3862d16d4fff40cf6 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 20 Jan 2025 18:51:30 +0100
Subject: [PATCH 09/37] fix(page_service / handle): panic when parallel client
 disconnect & Timeline shutdown (#10445)

## Refs
- fixes https://github.com/neondatabase/neon/issues/10444

## Problem

We're seeing a panic `handles are only shut down once in their lifetime`
in our performance testbed.

## Hypothesis

Annotated code in
https://github.com/neondatabase/neon/issues/10444#issuecomment-2602286415.

```
T1: drop Cache, executes up to (1)
=> HandleInner is now in state ShutDown
T2: Timeline::shutdown => PerTimelineState::shutdown  executes shutdown() again => panics
```

Likely this snuck in with the final touches of #10386 where I narrowed down
the locking rules.

## Summary of changes

Make duplicate shutdowns a no-op.
---
 pageserver/src/tenant/timeline/handle.rs | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
index 4c7bea25be..5b39daaaf8 100644
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -588,32 +588,40 @@ impl<T: Types> Drop for Cache<T> {
             let Some(handle_inner_arc) = handle_inner_weak.upgrade() else {
                 continue;
             };
-            let handle_timeline = handle_inner_arc
+            let Some(handle_timeline) = handle_inner_arc
                 // locking rules: drop lock before acquiring other lock below
                 .lock()
                 .expect("poisoned")
-                .shutdown();
+                .shutdown()
+            else {
+                // Concurrent PerTimelineState::shutdown.
+                continue;
+            };
+            // Clean up per_timeline_state so the HandleInner allocation can be dropped.
             let per_timeline_state = handle_timeline.per_timeline_state();
             let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned");
             let Some(handles) = &mut *handles_lock_guard else {
                 continue;
             };
             let Some(removed_handle_inner_arc) = handles.remove(&self.id) else {
-                // There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
+                // Concurrent PerTimelineState::shutdown.
                 continue;
             };
-            drop(handles_lock_guard); // locking rules: remember them when!
-            assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc,));
+            drop(handles_lock_guard); // locking rules!
+            assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc));
         }
     }
 }
 
 impl<T: Types> HandleInner<T> {
-    fn shutdown(&mut self) -> Arc<T::Timeline> {
+    fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
         match std::mem::replace(self, HandleInner::ShutDown) {
-            HandleInner::KeepingTimelineGateOpen { timeline, .. } => timeline,
+            HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
             HandleInner::ShutDown => {
-                unreachable!("handles are only shut down once in their lifetime");
+                // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
+                // may do it concurrently, but locking rules disallow holding per-timeline-state lock and
+                // the handle lock at the same time.
+                None
             }
         }
     }

From e781cf6dd82a150133621ad0165e1c6b03c844ad Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Mon, 20 Jan 2025 19:29:21 +0100
Subject: [PATCH 10/37] Compute/LFC: Apply limits consistently (#10449)

Otherwise we might hit ERRORs in otherwise safe situations (such as user
queries), which isn't a great user experience.

## Problem

https://github.com/neondatabase/neon/pull/10376

## Summary of changes

Instead of treating internal errors as acceptable, we ensure we don't
exceed our allocated usage.
---
 pgxn/neon/file_cache.c                       | 110 ++++++++++++-------
 test_runner/regress/test_local_file_cache.py |  71 +++++++++++-
 2 files changed, 139 insertions(+), 42 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index ad5667cbab..64b236061d 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -911,57 +911,85 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			if (entry->access_count++ == 0)
 				dlist_delete(&entry->list_node);
 		}
-		else
+		/*-----------
+		 * If the chunk wasn't already in the LFC then we have these
+		 * options, in order of preference:
+		 *
+		 * Unless there is no space available, we can:
+		 *  1. Use an entry from the `holes` list, and
+		 *  2. Create a new entry.
+		 * We can always, regardless of space in the LFC:
+		 *  3. evict an entry from LRU, and
+		 *  4. ignore the write operation (the least favorite option)
+		 */
+		else if (lfc_ctl->used < lfc_ctl->limit)
 		{
-			/*
-			 * We have two choices if all cache pages are pinned (i.e. used in IO
-			 * operations):
-			 *
-			 * 1) Wait until some of this operation is completed and pages is
-			 * unpinned.
-			 *
-			 * 2) Allocate one more chunk, so that specified cache size is more
-			 * recommendation than hard limit.
-			 *
-			 * As far as probability of such event (that all pages are pinned) is
-			 * considered to be very very small: there are should be very large
-			 * number of concurrent IO operations and them are limited by
-			 * max_connections, we prefer not to complicate code and use second
-			 * approach.
-			 */
-			if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
-			{
-				/* Cache overflow: evict least recently used chunk */
-				FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
-	
-				for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
-				{
-					lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
-				}
-				CriticalAssert(victim->access_count == 0);
-				entry->offset = victim->offset; /* grab victim's chunk */
-				hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
-				neon_log(DEBUG2, "Swap file cache page");
-			}
-			else if (!dlist_is_empty(&lfc_ctl->holes))
+			if (!dlist_is_empty(&lfc_ctl->holes))
 			{
 				/* We can reuse a hole that was left behind when the LFC was shrunk previously */
-				FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
-				uint32		offset = hole->offset;
-				bool		hole_found;
-	
-				hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found);
+				FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node,
+													   dlist_pop_head_node(&lfc_ctl->holes));
+				uint32 offset = hole->offset;
+				bool hole_found;
+
+				hash_search_with_hash_value(lfc_hash, &hole->key,
+											hole->hash, HASH_REMOVE, &hole_found);
 				CriticalAssert(hole_found);
-	
+
 				lfc_ctl->used += 1;
-				entry->offset = offset;	/* reuse the hole */
+				entry->offset = offset;			/* reuse the hole */
 			}
 			else
 			{
 				lfc_ctl->used += 1;
-				entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
-													 * of file */
+				entry->offset = lfc_ctl->size++;/* allocate new chunk at end
+												 * of file */
 			}
+		}
+		/*
+		 * We've already used up all allocated LFC entries.
+		 *
+		 * If we can clear an entry from the LRU, do that.
+		 * If we can't (e.g. because all other slots are being accessed)
+		 * then we will remove this entry from the hash and continue
+		 * on to the next chunk, as we may not exceed the limit.
+		 */
+		else if (!dlist_is_empty(&lfc_ctl->lru))
+		{
+			/* Cache overflow: evict least recently used chunk */
+			FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node,
+													 dlist_pop_head_node(&lfc_ctl->lru));
+
+			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+			{
+				lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
+			}
+
+			CriticalAssert(victim->access_count == 0);
+			entry->offset = victim->offset; /* grab victim's chunk */
+			hash_search_with_hash_value(lfc_hash, &victim->key,
+										victim->hash, HASH_REMOVE, NULL);
+			neon_log(DEBUG2, "Swap file cache page");
+		}
+		else
+		{
+			/* Can't add this chunk - we don't have the space for it */
+			hash_search_with_hash_value(lfc_hash, &entry->key, hash,
+										HASH_REMOVE, NULL);
+
+			/*
+			 * We can't process this chunk due to lack of space in LFC,
+			 * so skip to the next one
+			 */
+			LWLockRelease(lfc_lock);
+			blkno += blocks_in_chunk;
+			buf_offset += blocks_in_chunk;
+			nblocks -= blocks_in_chunk;
+			continue;
+		}
+
+		if (!found)
+		{
 			entry->access_count = 1;
 			entry->hash = hash;
 			memset(entry->bitmap, 0, sizeof entry->bitmap);
diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py
index 21c9e97a42..52ee2f32a2 100644
--- a/test_runner/regress/test_local_file_cache.py
+++ b/test_runner/regress/test_local_file_cache.py
@@ -7,9 +7,78 @@ import threading
 import time
 
 import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 from fixtures.utils import USE_LFC, query_scalar
 
+"""
+Test whether LFC doesn't error out when the LRU is empty, but the LFC is
+already at its maximum size.
+
+If we don't handle this safely, we might allocate more hash entries than
+otherwise considered safe, thus causing ERRORs in hash_search(HASH_ENTER) once
+we hit lfc->used >= lfc->limit.
+"""
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_local_file_cache_all_pinned(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "neon.max_file_cache_size='1MB'",
+            "neon.file_cache_size_limit='1MB'",
+        ],
+    )
+    top_cur = endpoint.connect().cursor()
+
+    stop = threading.Event()
+    n_rows = 10000
+    n_threads = 5
+    n_updates_per_connection = 1000
+
+    top_cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
+    top_cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")
+
+    # Start threads that will perform random UPDATEs. Each UPDATE
+    # increments the counter on the row, so that we can check at the
+    # end that the sum of all the counters match the number of updates
+    # performed (plus the initial 1 on each row).
+    #
+    # Furthermore, each thread will reconnect between every 1000 updates.
+    def run_updates(n_updates_performed_q: queue.Queue[int]):
+        n_updates_performed = 0
+        conn = endpoint.connect()
+        cur = conn.cursor()
+        while not stop.is_set():
+            id = random.randint(1, n_rows)
+            cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
+            n_updates_performed += 1
+            if n_updates_performed % n_updates_per_connection == 0:
+                cur.close()
+                conn.close()
+                conn = endpoint.connect()
+                cur = conn.cursor()
+        n_updates_performed_q.put(n_updates_performed)
+
+    n_updates_performed_q: queue.Queue[int] = queue.Queue()
+    threads: list[threading.Thread] = []
+    for _i in range(n_threads):
+        thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True)
+        thread.start()
+        threads.append(thread)
+
+    time.sleep(15)
+
+    stop.set()
+
+    n_updates_performed = 0
+    for thread in threads:
+        thread.join()
+        n_updates_performed += n_updates_performed_q.get()
+
+    assert query_scalar(top_cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed
+
 
 @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):

From 2de2b26c62016cce48cbc5449d44e3259f237b56 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 20 Jan 2025 15:44:12 -0500
Subject: [PATCH 11/37] feat(pageserver): add reldir migration configs (#10439)

## Problem

Part of #9516 per RFC at https://github.com/neondatabase/neon/pull/10412

## Summary of changes

Adding the necessary config items and index_part items for the large
relation count work.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 control_plane/src/pageserver.rs               |   5 +
 libs/pageserver_api/src/config.rs             |   5 +
 libs/pageserver_api/src/models.rs             |   6 +
 pageserver/src/tenant.rs                      |   1 +
 pageserver/src/tenant/config.rs               |   8 ++
 .../tenant/remote_timeline_client/index.rs    | 115 +++++++++++++++++-
 .../regress/test_attach_tenant_config.py      |   1 +
 7 files changed, 138 insertions(+), 3 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index ef5b3d6593..df81b44f2d 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -418,6 +418,11 @@ impl PageServerNode {
                 .map(serde_json::from_str)
                 .transpose()
                 .context("parse `wal_receiver_protocol_override` from json")?,
+            rel_size_v2_enabled: settings
+                .remove("rel_size_v2_enabled")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'rel_size_v2_enabled' as bool")?,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 09cfbc55fd..7fb7a9d54e 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -301,6 +301,10 @@ pub struct TenantConfigToml {
     pub timeline_offloading: bool,
 
     pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
+
+    /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
+    /// `index_part.json`, and it cannot be reversed.
+    pub rel_size_v2_enabled: Option<bool>,
 }
 
 pub mod defaults {
@@ -538,6 +542,7 @@ impl Default for TenantConfigToml {
             lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
             timeline_offloading: false,
             wal_receiver_protocol_override: None,
+            rel_size_v2_enabled: None,
         }
     }
 }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index c38af9cb80..1538134c96 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -497,6 +497,8 @@ pub struct TenantConfigPatch {
     pub timeline_offloading: FieldPatch<bool>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub rel_size_v2_enabled: FieldPatch<bool>,
 }
 
 /// An alternative representation of `pageserver::tenant::TenantConf` with
@@ -528,6 +530,7 @@ pub struct TenantConfig {
     pub lsn_lease_length_for_ts: Option<String>,
     pub timeline_offloading: Option<bool>,
     pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
+    pub rel_size_v2_enabled: Option<bool>,
 }
 
 impl TenantConfig {
@@ -557,6 +560,7 @@ impl TenantConfig {
             mut lsn_lease_length_for_ts,
             mut timeline_offloading,
             mut wal_receiver_protocol_override,
+            mut rel_size_v2_enabled,
         } = self;
 
         patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -601,6 +605,7 @@ impl TenantConfig {
         patch
             .wal_receiver_protocol_override
             .apply(&mut wal_receiver_protocol_override);
+        patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
 
         Self {
             checkpoint_distance,
@@ -627,6 +632,7 @@ impl TenantConfig {
             lsn_lease_length_for_ts,
             timeline_offloading,
             wal_receiver_protocol_override,
+            rel_size_v2_enabled,
         }
     }
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index bb1b36aed6..05a311391c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5475,6 +5475,7 @@ pub(crate) mod harness {
                 lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
                 timeline_offloading: Some(tenant_conf.timeline_offloading),
                 wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
+                rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled,
             }
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index edf2e6a3aa..14d8e9ccd4 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -357,6 +357,9 @@ pub struct TenantConfOpt {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub rel_size_v2_enabled: Option<bool>,
 }
 
 impl TenantConfOpt {
@@ -425,6 +428,7 @@ impl TenantConfOpt {
             wal_receiver_protocol_override: self
                 .wal_receiver_protocol_override
                 .or(global_conf.wal_receiver_protocol_override),
+            rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled),
         }
     }
 
@@ -454,6 +458,7 @@ impl TenantConfOpt {
             mut lsn_lease_length_for_ts,
             mut timeline_offloading,
             mut wal_receiver_protocol_override,
+            mut rel_size_v2_enabled,
         } = self;
 
         patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -522,6 +527,7 @@ impl TenantConfOpt {
         patch
             .wal_receiver_protocol_override
             .apply(&mut wal_receiver_protocol_override);
+        patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
 
         Ok(Self {
             checkpoint_distance,
@@ -548,6 +554,7 @@ impl TenantConfOpt {
             lsn_lease_length_for_ts,
             timeline_offloading,
             wal_receiver_protocol_override,
+            rel_size_v2_enabled,
         })
     }
 }
@@ -603,6 +610,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
             lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
             timeline_offloading: value.timeline_offloading,
             wal_receiver_protocol_override: value.wal_receiver_protocol_override,
+            rel_size_v2_enabled: value.rel_size_v2_enabled,
         }
     }
 }
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 08e94ae197..30b6b07ca3 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -79,6 +79,24 @@ pub struct IndexPart {
     /// when this flag is introduced.
     #[serde(skip_serializing_if = "Option::is_none", default)]
     pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
+
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub(crate) rel_size_migration: Option<RelSizeMigration>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub enum RelSizeMigration {
+    /// The tenant is using the old rel_size format.
+    /// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
+    /// `None` is the same as `Some(RelSizeMigration::Legacy)`.
+    Legacy,
+    /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
+    /// persisted in the index part. The read path will read both formats and merge them.
+    Migrating,
+    /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
+    /// in the index part, and the read path will not read the old format.
+    Migrated,
 }
 
 impl IndexPart {
@@ -97,10 +115,11 @@ impl IndexPart {
     /// - 8: added `archived_at`
     /// - 9: +gc_blocking
     /// - 10: +import_pgdata
-    const LATEST_VERSION: usize = 10;
+    /// - 11: +rel_size_migration
+    const LATEST_VERSION: usize = 11;
 
     // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
@@ -116,6 +135,7 @@ impl IndexPart {
             gc_blocking: None,
             last_aux_file_policy: None,
             import_pgdata: None,
+            rel_size_migration: None,
         }
     }
 
@@ -416,6 +436,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: None,
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -461,6 +482,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: None,
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -507,6 +529,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: None,
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -556,6 +579,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: None,
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -600,6 +624,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: None,
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -647,6 +672,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: None,
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -699,6 +725,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: Some(AuxFilePolicy::V2),
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -756,6 +783,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: Default::default(),
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -814,6 +842,7 @@ mod tests {
             gc_blocking: None,
             last_aux_file_policy: Default::default(),
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -877,6 +906,7 @@ mod tests {
             last_aux_file_policy: Default::default(),
             archived_at: None,
             import_pgdata: None,
+            rel_size_migration: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -952,7 +982,86 @@ mod tests {
                 started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
                 finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
                 idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
-            })))
+            }))),
+            rel_size_migration: None,
+        };
+
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
+    #[test]
+    fn v11_rel_size_migration_is_parsed() {
+        let example = r#"{
+            "version": 11,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata": {
+                "disk_consistent_lsn": "0/16960E8",
+                "prev_record_lsn": "0/1696070",
+                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/1696070",
+                "initdb_lsn": "0/1696070",
+                "pg_version": 14
+            },
+            "gc_blocking": {
+                "started_at": "2024-07-19T09:00:00.123",
+                "reasons": ["DetachAncestor"]
+            },
+            "import_pgdata": {
+                "V1": {
+                    "Done": {
+                        "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
+                        "started_at": "2024-11-13T09:23:42.123",
+                        "finished_at": "2024-11-13T09:42:23.123"
+                    }
+                }
+            },
+            "rel_size_migration": "legacy"
+        }"#;
+
+        let expected = IndexPart {
+            version: 11,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::new(
+                Lsn::from_str("0/16960E8").unwrap(),
+                Some(Lsn::from_str("0/1696070").unwrap()),
+                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
+                Lsn::INVALID,
+                Lsn::from_str("0/1696070").unwrap(),
+                Lsn::from_str("0/1696070").unwrap(),
+                14,
+            ).with_recalculated_checksum().unwrap(),
+            deleted_at: None,
+            lineage: Default::default(),
+            gc_blocking: Some(GcBlocking {
+                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
+                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
+            }),
+            last_aux_file_policy: Default::default(),
+            archived_at: None,
+            import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
+                started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
+                finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
+                idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
+            }))),
+            rel_size_migration: Some(RelSizeMigration::Legacy),
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 45112fd67e..b34dbddc80 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -176,6 +176,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
             "type": "interpreted",
             "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
         },
+        "rel_size_v2_enabled": True,
     }
 
     vps_http = env.storage_controller.pageserver_api()

From 2ab9f6982590cd8570f505df23c134cc71a1a576 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 20 Jan 2025 22:57:15 +0100
Subject: [PATCH 12/37] Simplify pageserver_physical_gc function (#10104)

This simplifies the code in `pageserver_physical_gc` a little bit after
the feedback in #10007 that the code is too complicated.

Most importantly, we don't pass around `GcSummary` any more in a
complicated fashion, and we save on async stream-combinator-inception in
one place in favour of `try_stream!{}`.

Follow-up of #10007
---
 .../src/pageserver_physical_gc.rs             | 86 +++++++++----------
 1 file changed, 40 insertions(+), 46 deletions(-)

diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index a997373375..063c6bcfb9 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -8,6 +8,8 @@ use crate::checks::{
 };
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
+use async_stream::try_stream;
+use futures::future::Either;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest;
@@ -578,7 +580,7 @@ async fn gc_timeline(
     target: &RootTarget,
     mode: GcMode,
     ttid: TenantShardTimelineId,
-    accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
+    accumulator: &std::sync::Mutex<TenantRefAccumulator>,
     tenant_manifest_info: Arc<Option<RemoteTenantManifestInfo>>,
 ) -> anyhow::Result<GcSummary> {
     let mut summary = GcSummary::default();
@@ -721,9 +723,9 @@ pub async fn pageserver_physical_gc(
 
     let remote_client = Arc::new(remote_client);
     let tenants = if tenant_shard_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants(&remote_client, &target))
+        Either::Left(stream_tenants(&remote_client, &target))
     } else {
-        futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
+        Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
     };
 
     // How many tenants to process in parallel.  We need to be mindful of pageservers
@@ -731,16 +733,16 @@ pub async fn pageserver_physical_gc(
     const CONCURRENCY: usize = 32;
 
     // Accumulate information about each tenant for cross-shard GC step we'll do at the end
-    let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
+    let accumulator = std::sync::Mutex::new(TenantRefAccumulator::default());
+
+    // Accumulate information about how many manifests we have GCd
+    let manifest_gc_summary = std::sync::Mutex::new(GcSummary::default());
 
     // Generate a stream of TenantTimelineId
-    enum GcSummaryOrContent<T> {
-        Content(T),
-        GcSummary(GcSummary),
-    }
     let timelines = tenants.map_ok(|tenant_shard_id| {
         let target_ref = &target;
         let remote_client_ref = &remote_client;
+        let manifest_gc_summary_ref = &manifest_gc_summary;
         async move {
             let gc_manifest_result = gc_tenant_manifests(
                 remote_client_ref,
@@ -757,55 +759,48 @@ pub async fn pageserver_physical_gc(
                     (GcSummary::default(), None)
                 }
             };
+            manifest_gc_summary_ref
+                .lock()
+                .unwrap()
+                .merge(summary_from_manifest);
             let tenant_manifest_arc = Arc::new(tenant_manifest_opt);
-            let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary(
-                summary_from_manifest,
-            ));
-            stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id)
-                .await
-                .map(|stream| {
-                    stream
-                        .zip(futures::stream::iter(std::iter::repeat(
-                            tenant_manifest_arc,
-                        )))
-                        .map(|(ttid_res, tenant_manifest_arc)| {
-                            ttid_res.map(move |ttid| {
-                                GcSummaryOrContent::Content((ttid, tenant_manifest_arc))
-                            })
-                        })
-                        .chain(futures::stream::iter([summary_from_manifest].into_iter()))
-                })
+            let mut timelines = Box::pin(
+                stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?,
+            );
+            Ok(try_stream! {
+                while let Some(ttid_res) = timelines.next().await {
+                    let ttid = ttid_res?;
+                    yield (ttid, tenant_manifest_arc.clone());
+                }
+            })
         }
     });
-    let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
-    let timelines = timelines.try_flatten();
 
     let mut summary = GcSummary::default();
-
-    // Drain futures for per-shard GC, populating accumulator as a side effect
     {
-        let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid {
-            GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => {
-                futures::future::Either::Left(gc_timeline(
-                    &remote_client,
-                    &min_age,
-                    &target,
-                    mode,
-                    ttid,
-                    &accumulator,
-                    tenant_manifest_arc,
-                ))
-            }
-            GcSummaryOrContent::GcSummary(gc_summary) => {
-                futures::future::Either::Right(futures::future::ok(gc_summary))
-            }
+        let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
+        let timelines = timelines.try_flatten();
+
+        let timelines = timelines.map_ok(|(ttid, tenant_manifest_arc)| {
+            gc_timeline(
+                &remote_client,
+                &min_age,
+                &target,
+                mode,
+                ttid,
+                &accumulator,
+                tenant_manifest_arc,
+            )
         });
         let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
 
+        // Drain futures for per-shard GC, populating accumulator as a side effect
         while let Some(i) = timelines.next().await {
             summary.merge(i?);
         }
     }
+    // Streams are lazily evaluated, so only now do we have access to the inner object
+    summary.merge(manifest_gc_summary.into_inner().unwrap());
 
     // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
     let Some(client) = controller_client else {
@@ -813,8 +808,7 @@ pub async fn pageserver_physical_gc(
         return Ok(summary);
     };
 
-    let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator)
-        .unwrap()
+    let (ancestor_shards, ancestor_refs) = accumulator
         .into_inner()
         .unwrap()
         .into_gc_ancestors(client, &mut summary)

From 624a5075444a92a199378e80182fad4eecf8e509 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Tue, 21 Jan 2025 13:45:21 +0100
Subject: [PATCH 13/37] Create Github releases with empty body for now (#10448)

## Problem
When releasing `release-7574`, the Github Release creation failed with
"body is too long" (see
https://github.com/neondatabase/neon/actions/runs/12834025431/job/35792346745#step:5:77).
There's lots of room for improvement of the release notes, but for now
we'll disable them instead.

## Summary of changes
- Disable automatic generation of release notes for Github releases
- Enable creation of Github releases for proxy/compute
---
 .github/workflows/build_and_test.yml | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b0e07535b3..4fc81dccaa 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1078,12 +1078,6 @@ jobs:
               console.log(`Tag ${tag} created successfully.`);
             }
 
-            // TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
-            if (context.ref !== 'refs/heads/release') {
-              console.log(`GitHub release skipped for ${context.ref}.`);
-              return;
-            }
-
             try {
               const existingRelease = await github.rest.repos.getReleaseByTag({
                 owner: context.repo.owner,
@@ -1102,7 +1096,8 @@ jobs:
                 owner: context.repo.owner,
                 repo: context.repo.repo,
                 tag_name: tag,
-                generate_release_notes: true,
+                // TODO: Automate release notes properly
+                generate_release_notes: false,
               });
               console.log(`Release for tag ${tag} created successfully.`);
             }

From 7e4a39ea539abedf78e885797e23923f1d5e2873 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 21 Jan 2025 16:40:04 +0100
Subject: [PATCH 14/37] Fix two flakiness sources in
 test_scrubber_physical_gc_ancestors (#10457)

We currently have some flakiness in
`test_scrubber_physical_gc_ancestors`, see #10391.

The first flakiness kind is about the reconciler not actually becoming
idle within the timeout of 30 seconds. We see continuous forward
progress, so this is likely not a hang. We also see this happen in
parallel to a test failure, so it is likely due to runners being
overloaded. Therefore, we increase the timeout to 120 seconds.

The second flakiness kind is an assertion failure. This one is a little
bit more tricky, but we saw in the successful run that there was some
advance of the lsn between the compaction run (which created layer
files) and the gc run. Apparently gc rejects reductions to the single
image layer setting if the cutoff lsn is the same as the lsn of the
image layer: it will claim that that layer is newer than the space
cutoff and therefore skip it, while thinking the old layer (that we want
to delete) is the latest one (so it's not deleted).

We address the second flakiness kind by inserting a tiny amount of WAL
between the compaction and gc. This should hopefully fix things.

Related issue: #10391

(not closing it with the merger of the PR as we'll need to validate that
these changes had the intended effect).

Thanks to Chi for going over this together with me in a call.
---
 test_runner/regress/test_storage_scrubber.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 220c428531..a782e85567 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -227,7 +227,9 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_
     new_shard_count = 4
     assert shard_count is None or new_shard_count > shard_count
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
-    env.storage_controller.reconcile_until_idle()  # Move shards to their final locations immediately
+    env.storage_controller.reconcile_until_idle(
+        timeout_secs=120
+    )  # Move shards to their final locations immediately
 
     # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors
     env.storage_controller.pageserver_api().timeline_create(
@@ -269,6 +271,8 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_
         ps.http_client().timeline_compact(
             shard, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True
         )
+        # Add some WAL so that we don't gc at the latest remote consistent lsn
+        workload.churn_rows(1)
         ps.http_client().timeline_gc(shard, timeline_id, 0)
 
     # We will use a min_age_secs=1 threshold for deletion, let it pass

From 19bf7b78a0c38508753d3f54e8faea695f686d19 Mon Sep 17 00:00:00 2001
From: Gleb Novikov <NanoBjorn@users.noreply.github.com>
Date: Tue, 21 Jan 2025 16:50:44 +0000
Subject: [PATCH 15/37] fast import: basic python test (#10271)

We did not have any tests for the `fast_import` binary yet.

In this PR I have introduced:
- a `FastImport` class and pytest fixture for testing in Python
- a basic test that runs `fast_import` against vanilla postgres and checks
that the imported data is there

Should be merged after https://github.com/neondatabase/neon/pull/10251
---
 compute_tools/src/bin/fast_import.rs      |  21 ++++-
 test_runner/conftest.py                   |   1 +
 test_runner/fixtures/fast_import.py       | 104 ++++++++++++++++++++++
 test_runner/regress/test_import_pgdata.py |  42 ++++++++-
 4 files changed, 165 insertions(+), 3 deletions(-)
 create mode 100644 test_runner/fixtures/fast_import.py

diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs
index 5b008f8182..c8440afb64 100644
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -58,6 +58,8 @@ struct Args {
     pg_bin_dir: Utf8PathBuf,
     #[clap(long)]
     pg_lib_dir: Utf8PathBuf,
+    #[clap(long)]
+    pg_port: Option<u16>, // port to run postgres on, 5432 is default
 }
 
 #[serde_with::serde_as]
@@ -74,6 +76,13 @@ enum EncryptionSecret {
     KMS { key_id: String },
 }
 
+// copied from pageserver_api::config::defaults::DEFAULT_LOCALE to avoid dependency just for a constant
+const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
+    "C"
+} else {
+    "C.UTF-8"
+};
+
 #[tokio::main]
 pub(crate) async fn main() -> anyhow::Result<()> {
     utils::logging::init(
@@ -97,6 +106,10 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     let working_directory = args.working_directory;
     let pg_bin_dir = args.pg_bin_dir;
     let pg_lib_dir = args.pg_lib_dir;
+    let pg_port = args.pg_port.unwrap_or_else(|| {
+        info!("pg_port not specified, using default 5432");
+        5432
+    });
 
     // Initialize AWS clients only if s3_prefix is specified
     let (aws_config, kms_client) = if args.s3_prefix.is_some() {
@@ -180,7 +193,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
     postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
         superuser,
-        locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded,
+        locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
         pg_version,
         initdb_bin: pg_bin_dir.join("initdb").as_ref(),
         library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
@@ -197,6 +210,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     let mut postgres_proc = tokio::process::Command::new(pgbin)
         .arg("-D")
         .arg(&pgdata_dir)
+        .args(["-p", &format!("{pg_port}")])
         .args(["-c", "wal_level=minimal"])
         .args(["-c", "shared_buffers=10GB"])
         .args(["-c", "max_wal_senders=0"])
@@ -216,6 +230,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
             ),
         ])
         .env_clear()
+        .env("LD_LIBRARY_PATH", &pg_lib_dir)
         .stdout(std::process::Stdio::piped())
         .stderr(std::process::Stdio::piped())
         .spawn()
@@ -232,7 +247,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
 
     // Create neondb database in the running postgres
     let restore_pg_connstring =
-        format!("host=localhost port=5432 user={superuser} dbname=postgres");
+        format!("host=localhost port={pg_port} user={superuser} dbname=postgres");
 
     let start_time = std::time::Instant::now();
 
@@ -314,6 +329,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
             .arg(&source_connection_string)
             // how we run it
             .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir)
             .kill_on_drop(true)
             .stdout(std::process::Stdio::piped())
             .stderr(std::process::Stdio::piped())
@@ -347,6 +363,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
             .arg(&dumpdir)
             // how we run it
             .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir)
             .kill_on_drop(true)
             .stdout(std::process::Stdio::piped())
             .stderr(std::process::Stdio::piped())
diff --git a/test_runner/conftest.py b/test_runner/conftest.py
index 9e32469d69..4b591d3316 100644
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -15,4 +15,5 @@ pytest_plugins = (
     "fixtures.compare_fixtures",
     "fixtures.slow",
     "fixtures.reruns",
+    "fixtures.fast_import",
 )
diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py
new file mode 100644
index 0000000000..33248132ab
--- /dev/null
+++ b/test_runner/fixtures/fast_import.py
@@ -0,0 +1,104 @@
+import os
+import shutil
+import subprocess
+import tempfile
+from collections.abc import Iterator
+from pathlib import Path
+
+import pytest
+
+from fixtures.log_helper import log
+from fixtures.neon_cli import AbstractNeonCli
+from fixtures.pg_version import PgVersion
+
+
+class FastImport(AbstractNeonCli):
+    COMMAND = "fast_import"
+    cmd: subprocess.CompletedProcess[str] | None = None
+
+    def __init__(
+        self,
+        extra_env: dict[str, str] | None,
+        binpath: Path,
+        pg_distrib_dir: Path,
+        pg_version: PgVersion,
+        workdir: Path,
+    ):
+        if extra_env is None:
+            env_vars = {}
+        else:
+            env_vars = extra_env.copy()
+
+        if not (binpath / self.COMMAND).exists():
+            raise Exception(f"{self.COMMAND} binary not found at '{binpath}'")
+        super().__init__(env_vars, binpath)
+
+        pg_dir = pg_distrib_dir / pg_version.v_prefixed
+        self.pg_distrib_dir = pg_distrib_dir
+        self.pg_version = pg_version
+        self.pg_bin = pg_dir / "bin"
+        if not (self.pg_bin / "postgres").exists():
+            raise Exception(f"postgres binary was not found at '{self.pg_bin}'")
+        self.pg_lib = pg_dir / "lib"
+        if env_vars.get("LD_LIBRARY_PATH") is not None:
+            self.pg_lib = Path(env_vars["LD_LIBRARY_PATH"])
+        elif os.getenv("LD_LIBRARY_PATH") is not None:
+            self.pg_lib = Path(str(os.getenv("LD_LIBRARY_PATH")))
+        if not workdir.exists():
+            raise Exception(f"Working directory '{workdir}' does not exist")
+        self.workdir = workdir
+
+    def run(
+        self,
+        pg_port: int,
+        source_connection_string: str | None = None,
+        s3prefix: str | None = None,
+        interactive: bool = False,
+    ) -> subprocess.CompletedProcess[str]:
+        if self.cmd is not None:
+            raise Exception("Command already executed")
+        args = [
+            f"--pg-bin-dir={self.pg_bin}",
+            f"--pg-lib-dir={self.pg_lib}",
+            f"--pg-port={pg_port}",
+            f"--working-directory={self.workdir}",
+        ]
+        if source_connection_string is not None:
+            args.append(f"--source-connection-string={source_connection_string}")
+        if s3prefix is not None:
+            args.append(f"--s3-prefix={s3prefix}")
+        if interactive:
+            args.append("--interactive")
+
+        self.cmd = self.raw_cli(args)
+        return self.cmd
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        if self.workdir.exists():
+            shutil.rmtree(self.workdir)
+
+
+@pytest.fixture(scope="function")
+def fast_import(
+    pg_version: PgVersion,
+    test_output_dir: Path,
+    neon_binpath: Path,
+    pg_distrib_dir: Path,
+) -> Iterator[FastImport]:
+    workdir = Path(tempfile.mkdtemp())
+    with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi:
+        yield fi
+
+        if fi.cmd is None:
+            return
+
+        # dump stdout & stderr into test log dir
+        with open(test_output_dir / "fast_import.stdout", "w") as f:
+            f.write(fi.cmd.stdout)
+        with open(test_output_dir / "fast_import.stderr", "w") as f:
+            f.write(fi.cmd.stderr)
+
+        log.info("Written logs to %s", test_output_dir)
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py
index 6ea2393a9d..d02a9d19db 100644
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -7,13 +7,15 @@ import psycopg2
 import psycopg2.errors
 import pytest
 from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.fast_import import FastImport
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, VanillaPostgres
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PgProtocol, VanillaPostgres
 from fixtures.pageserver.http import (
     ImportPgdataIdemptencyKey,
     PageserverApiException,
 )
 from fixtures.pg_version import PgVersion
+from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind
 from fixtures.utils import run_only_on_postgres
 from pytest_httpserver import HTTPServer
@@ -313,3 +315,41 @@ def test_pgdata_import_smoke(
     validate_vanilla_equivalence(br_initdb_endpoint)
     with pytest.raises(psycopg2.errors.UndefinedTable):
         br_initdb_endpoint.safe_psql("select * from othertable")
+
+
+@run_only_on_postgres(
+    [PgVersion.V14, PgVersion.V15, PgVersion.V16],
+    "newer control file catalog version and struct format isn't supported",
+)
+def test_fast_import_binary(
+    test_output_dir,
+    vanilla_pg: VanillaPostgres,
+    port_distributor: PortDistributor,
+    fast_import: FastImport,
+):
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);")
+
+    pg_port = port_distributor.get_port()
+    fast_import.run(pg_port, vanilla_pg.connstr())
+    vanilla_pg.stop()
+
+    pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version)
+    with VanillaPostgres(
+        fast_import.workdir / "pgdata", pgbin, pg_port, False
+    ) as new_pgdata_vanilla_pg:
+        new_pgdata_vanilla_pg.start()
+
+        # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres
+        conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb")
+        res = conn.safe_psql("SELECT count(*) FROM foo;")
+        log.info(f"Result: {res}")
+        assert res[0][0] == 10
+
+
+# TODO: Maybe test with pageserver?
+# 1. run whole neon env
+# 2. create timeline with some s3 path???
+# 3. run fast_import with s3 prefix
+# 4. ??? mock http where pageserver will report progress
+# 5. run compute on this timeline and check if data is there

From 737888e5c99474b2c411418a97f2eb16d825aa07 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 21 Jan 2025 20:17:14 +0100
Subject: [PATCH 16/37] Remove the tests for `pg_anon` (#10382)

## Problem

We are removing the `pg_anon` v1 extension from Neon, so we don't need
to test it anymore and can remove the code for simplicity.

## Summary of changes

The code required for testing `pg_anon` is removed.
---
 compute/compute-node.Dockerfile           |  6 -----
 docker-compose/compute_wrapper/Dockerfile |  3 ---
 docker-compose/docker_compose_test.sh     | 30 +----------------------
 3 files changed, 1 insertion(+), 38 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 8c7200c5cb..dbe7de046b 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1347,9 +1347,6 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
 COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
 #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
 #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
-#pg_anon is not supported yet for pg v17 so, don't fail if nothing found
-COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src
-COPY compute/patches/pg_anon.patch /ext-src
 COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
 COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
 RUN cd /ext-src/ && for f in *.tar.gz; \
@@ -1360,9 +1357,6 @@ RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
 RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
-RUN case "${PG_VERSION}" in "v17") \
-    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
-    esac && patch -p1 </ext-src/pg_anon.patch
 RUN patch -p1 </ext-src/pg_cron.patch
 ENV PATH=/usr/local/pgsql/bin:$PATH
 ENV PGHOST=compute
diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile
index 05a2cf124c..e2e5bc7248 100644
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -10,10 +10,7 @@ USER root
 RUN apt-get update &&       \
     apt-get install -y curl \
                        jq   \
-                       python3-pip \
                        netcat-openbsd
-#Faker is required for the pg_anon test
-RUN case $COMPUTE_IMAGE in compute-node-v17) OPT="--break-system-packages";; *) OPT= ;; esac && pip3 install $OPT Faker
 #This is required for the pg_hintplan test
 RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src 
 
diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh
index 063664d0c6..4f1ae64873 100755
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -18,9 +18,6 @@ cd $(dirname $0)
 COMPUTE_CONTAINER_NAME=docker-compose-compute-1
 TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
 PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
-: ${http_proxy:=}
-: ${https_proxy:=}
-export http_proxy https_proxy
 
 cleanup() {
     echo "show container information"
@@ -35,12 +32,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
     echo "clean up containers if exists"
     cleanup
     PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
-    # The support of pg_anon not yet added to PG17, so we have to add the corresponding option for other PG versions
-    if [ "${pg_version}" -ne 17 ]; then
-      SPEC_PATH="compute_wrapper/var/db/postgres/specs"
-      mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak
-      jq '.cluster.settings += [{"name": "session_preload_libraries","value": "anon","vartype": "string"}]' "${SPEC_PATH}/spec.bak" > "${SPEC_PATH}/spec.json"
-    fi
     PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
 
     echo "wait until the compute is ready. timeout after 60s. "
@@ -62,27 +53,12 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
     done
 
     if [ $pg_version -ge 16 ]; then
-        echo Enabling trust connection
-        docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
-        echo Adding postgres role
-        docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
         # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
         # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
         echo Adding dummy config
         docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
-        # This block is required for the pg_anon extension test.
-        # The test assumes that it is running on the same host with the postgres engine.
-        # In our case it's not true, that's why we are copying files to the compute node
+        # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
         TMPDIR=$(mktemp -d)
-        # Add support for pg_anon for pg_v16
-        if [ $pg_version -ne 17 ]; then
-          docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
-          echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
-          docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
-        rm -rf $TMPDIR
-        fi
-        TMPDIR=$(mktemp -d)
-        # The following block does the same for the pg_hintplan test
         docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
         docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
         rm -rf $TMPDIR
@@ -106,8 +82,4 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
         fi
     fi
     cleanup
-    # Restore the original spec.json
-    if [ "$pg_version" -ne 17 ]; then
-      mv "$SPEC_PATH/spec.bak" "$SPEC_PATH/spec.json"
-    fi
 done

From 7d4bfcdc4795654100b87a02d398e41373027e44 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 21 Jan 2025 14:29:38 -0500
Subject: [PATCH 17/37] feat(pageserver): add config items for gc-compaction
 auto trigger (#10455)

## Problem

part of https://github.com/neondatabase/neon/issues/9114

The automatic trigger is already implemented at
https://github.com/neondatabase/neon/pull/10221 but I need to write some
tests and finish my experiments in staging before I can merge it with
confidence. Given that I have some other patches that will modify the
config items, I'd like to get the config items merged first to reduce
conflicts.

## Summary of changes

* add `l2_lsn` to index_part.json -- below that LSN, data have been
processed by gc-compaction
* add a set of gc-compaction auto trigger control items into the config

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 control_plane/src/pageserver.rs               |  15 +++
 libs/pageserver_api/src/config.rs             |  16 +++
 libs/pageserver_api/src/models.rs             |  24 ++++
 pageserver/src/tenant.rs                      |   5 +
 pageserver/src/tenant/config.rs               |  36 ++++++
 .../tenant/remote_timeline_client/index.rs    | 104 +++++++++++++++++-
 .../regress/test_attach_tenant_config.py      |   3 +
 7 files changed, 201 insertions(+), 2 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index df81b44f2d..b33b2877b3 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -423,6 +423,21 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'rel_size_v2_enabled' as bool")?,
+            gc_compaction_enabled: settings
+                .remove("gc_compaction_enabled")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_compaction_enabled' as bool")?,
+            gc_compaction_initial_threshold_kb: settings
+                .remove("gc_compaction_initial_threshold_kb")
+                .map(|x| x.parse::<u64>())
+                .transpose()
+                .context("Failed to parse 'gc_compaction_initial_threshold_kb' as integer")?,
+            gc_compaction_ratio_percent: settings
+                .remove("gc_compaction_ratio_percent")
+                .map(|x| x.parse::<u64>())
+                .transpose()
+                .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 7fb7a9d54e..f0aeb00736 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -305,6 +305,16 @@ pub struct TenantConfigToml {
     /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
     /// `index_part.json`, and it cannot be reversed.
     pub rel_size_v2_enabled: Option<bool>,
+
+    // gc-compaction related configs
+    /// Enable automatic gc-compaction trigger on this tenant.
+    pub gc_compaction_enabled: bool,
+    /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
+    /// gc-compaction will be triggered.
+    pub gc_compaction_initial_threshold_kb: u64,
+    /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
+    /// is above this ratio, gc-compaction will be triggered.
+    pub gc_compaction_ratio_percent: u64,
 }
 
 pub mod defaults {
@@ -498,6 +508,9 @@ pub mod tenant_conf_defaults {
     // By default ingest enough WAL for two new L0 layers before checking if new image
     // image layers should be created.
     pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
+    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
+    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000;
+    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
 }
 
 impl Default for TenantConfigToml {
@@ -543,6 +556,9 @@ impl Default for TenantConfigToml {
             timeline_offloading: false,
             wal_receiver_protocol_override: None,
             rel_size_v2_enabled: None,
+            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
+            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
+            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
         }
     }
 }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 1538134c96..fd4879087f 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -499,6 +499,12 @@ pub struct TenantConfigPatch {
     pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub rel_size_v2_enabled: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_compaction_enabled: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_compaction_ratio_percent: FieldPatch<u64>,
 }
 
 /// An alternative representation of `pageserver::tenant::TenantConf` with
@@ -531,6 +537,9 @@ pub struct TenantConfig {
     pub timeline_offloading: Option<bool>,
     pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
     pub rel_size_v2_enabled: Option<bool>,
+    pub gc_compaction_enabled: Option<bool>,
+    pub gc_compaction_initial_threshold_kb: Option<u64>,
+    pub gc_compaction_ratio_percent: Option<u64>,
 }
 
 impl TenantConfig {
@@ -561,6 +570,9 @@ impl TenantConfig {
             mut timeline_offloading,
             mut wal_receiver_protocol_override,
             mut rel_size_v2_enabled,
+            mut gc_compaction_enabled,
+            mut gc_compaction_initial_threshold_kb,
+            mut gc_compaction_ratio_percent,
         } = self;
 
         patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -606,6 +618,15 @@ impl TenantConfig {
             .wal_receiver_protocol_override
             .apply(&mut wal_receiver_protocol_override);
         patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
+        patch
+            .gc_compaction_enabled
+            .apply(&mut gc_compaction_enabled);
+        patch
+            .gc_compaction_initial_threshold_kb
+            .apply(&mut gc_compaction_initial_threshold_kb);
+        patch
+            .gc_compaction_ratio_percent
+            .apply(&mut gc_compaction_ratio_percent);
 
         Self {
             checkpoint_distance,
@@ -633,6 +654,9 @@ impl TenantConfig {
             timeline_offloading,
             wal_receiver_protocol_override,
             rel_size_v2_enabled,
+            gc_compaction_enabled,
+            gc_compaction_initial_threshold_kb,
+            gc_compaction_ratio_percent,
         }
     }
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 05a311391c..e45ba2ca3b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5476,6 +5476,11 @@ pub(crate) mod harness {
                 timeline_offloading: Some(tenant_conf.timeline_offloading),
                 wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
                 rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled,
+                gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled),
+                gc_compaction_initial_threshold_kb: Some(
+                    tenant_conf.gc_compaction_initial_threshold_kb,
+                ),
+                gc_compaction_ratio_percent: Some(tenant_conf.gc_compaction_ratio_percent),
             }
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 14d8e9ccd4..3db1445f6e 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -360,6 +360,15 @@ pub struct TenantConfOpt {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     pub rel_size_v2_enabled: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_compaction_enabled: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_compaction_initial_threshold_kb: Option<u64>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_compaction_ratio_percent: Option<u64>,
 }
 
 impl TenantConfOpt {
@@ -429,6 +438,15 @@ impl TenantConfOpt {
                 .wal_receiver_protocol_override
                 .or(global_conf.wal_receiver_protocol_override),
             rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled),
+            gc_compaction_enabled: self
+                .gc_compaction_enabled
+                .unwrap_or(global_conf.gc_compaction_enabled),
+            gc_compaction_initial_threshold_kb: self
+                .gc_compaction_initial_threshold_kb
+                .unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
+            gc_compaction_ratio_percent: self
+                .gc_compaction_ratio_percent
+                .unwrap_or(global_conf.gc_compaction_ratio_percent),
         }
     }
 
@@ -459,6 +477,9 @@ impl TenantConfOpt {
             mut timeline_offloading,
             mut wal_receiver_protocol_override,
             mut rel_size_v2_enabled,
+            mut gc_compaction_enabled,
+            mut gc_compaction_initial_threshold_kb,
+            mut gc_compaction_ratio_percent,
         } = self;
 
         patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -528,6 +549,15 @@ impl TenantConfOpt {
             .wal_receiver_protocol_override
             .apply(&mut wal_receiver_protocol_override);
         patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
+        patch
+            .gc_compaction_enabled
+            .apply(&mut gc_compaction_enabled);
+        patch
+            .gc_compaction_initial_threshold_kb
+            .apply(&mut gc_compaction_initial_threshold_kb);
+        patch
+            .gc_compaction_ratio_percent
+            .apply(&mut gc_compaction_ratio_percent);
 
         Ok(Self {
             checkpoint_distance,
@@ -555,6 +585,9 @@ impl TenantConfOpt {
             timeline_offloading,
             wal_receiver_protocol_override,
             rel_size_v2_enabled,
+            gc_compaction_enabled,
+            gc_compaction_initial_threshold_kb,
+            gc_compaction_ratio_percent,
         })
     }
 }
@@ -611,6 +644,9 @@ impl From<TenantConfOpt> for models::TenantConfig {
             timeline_offloading: value.timeline_offloading,
             wal_receiver_protocol_override: value.wal_receiver_protocol_override,
             rel_size_v2_enabled: value.rel_size_v2_enabled,
+            gc_compaction_enabled: value.gc_compaction_enabled,
+            gc_compaction_initial_threshold_kb: value.gc_compaction_initial_threshold_kb,
+            gc_compaction_ratio_percent: value.gc_compaction_ratio_percent,
         }
     }
 }
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 30b6b07ca3..3824bc8f11 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -77,11 +77,17 @@ pub struct IndexPart {
     ///
     /// None means no aux files have been written to the storage before the point
     /// when this flag is introduced.
+    ///
+    /// This flag is not used any more as all tenants have been transitioned to the new aux file policy.
     #[serde(skip_serializing_if = "Option::is_none", default)]
     pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
 
     #[serde(skip_serializing_if = "Option::is_none", default)]
     pub(crate) rel_size_migration: Option<RelSizeMigration>,
+
+    /// The LSN of gc-compaction horizon. Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub(crate) l2_lsn: Option<Lsn>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -116,10 +122,11 @@ impl IndexPart {
     /// - 9: +gc_blocking
     /// - 10: +import_pgdata
     /// - 11: +rel_size_migration
-    const LATEST_VERSION: usize = 11;
+    /// - 12: +l2_lsn
+    const LATEST_VERSION: usize = 12;
 
     // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
@@ -136,6 +143,7 @@ impl IndexPart {
             last_aux_file_policy: None,
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         }
     }
 
@@ -437,6 +445,7 @@ mod tests {
             last_aux_file_policy: None,
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -483,6 +492,7 @@ mod tests {
             last_aux_file_policy: None,
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -530,6 +540,7 @@ mod tests {
             last_aux_file_policy: None,
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -580,6 +591,7 @@ mod tests {
             last_aux_file_policy: None,
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -625,6 +637,7 @@ mod tests {
             last_aux_file_policy: None,
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -673,6 +686,7 @@ mod tests {
             last_aux_file_policy: None,
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -726,6 +740,7 @@ mod tests {
             last_aux_file_policy: Some(AuxFilePolicy::V2),
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -784,6 +799,7 @@ mod tests {
             last_aux_file_policy: Default::default(),
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -843,6 +859,7 @@ mod tests {
             last_aux_file_policy: Default::default(),
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -907,6 +924,7 @@ mod tests {
             archived_at: None,
             import_pgdata: None,
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -984,6 +1002,7 @@ mod tests {
                 idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
             }))),
             rel_size_migration: None,
+            l2_lsn: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -1062,6 +1081,87 @@ mod tests {
                 idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
             }))),
             rel_size_migration: Some(RelSizeMigration::Legacy),
+            l2_lsn: None,
+        };
+
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
+    #[test]
+    fn v12_l2_lsn_is_parsed() {
+        let example = r#"{
+            "version": 12,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata": {
+                "disk_consistent_lsn": "0/16960E8",
+                "prev_record_lsn": "0/1696070",
+                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/1696070",
+                "initdb_lsn": "0/1696070",
+                "pg_version": 14
+            },
+            "gc_blocking": {
+                "started_at": "2024-07-19T09:00:00.123",
+                "reasons": ["DetachAncestor"]
+            },
+            "import_pgdata": {
+                "V1": {
+                    "Done": {
+                        "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
+                        "started_at": "2024-11-13T09:23:42.123",
+                        "finished_at": "2024-11-13T09:42:23.123"
+                    }
+                }
+            },
+            "rel_size_migration": "legacy",
+            "l2_lsn": "0/16960E8"
+        }"#;
+
+        let expected = IndexPart {
+            version: 12,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::new(
+                Lsn::from_str("0/16960E8").unwrap(),
+                Some(Lsn::from_str("0/1696070").unwrap()),
+                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
+                Lsn::INVALID,
+                Lsn::from_str("0/1696070").unwrap(),
+                Lsn::from_str("0/1696070").unwrap(),
+                14,
+            ).with_recalculated_checksum().unwrap(),
+            deleted_at: None,
+            lineage: Default::default(),
+            gc_blocking: Some(GcBlocking {
+                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
+                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
+            }),
+            last_aux_file_policy: Default::default(),
+            archived_at: None,
+            import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
+                started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
+                finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
+                idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
+            }))),
+            rel_size_migration: Some(RelSizeMigration::Legacy),
+            l2_lsn: Some("0/16960E8".parse::<Lsn>().unwrap()),
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index b34dbddc80..b8d47346a3 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -177,6 +177,9 @@ def test_fully_custom_config(positive_env: NeonEnv):
             "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
         },
         "rel_size_v2_enabled": True,
+        "gc_compaction_enabled": True,
+        "gc_compaction_initial_threshold_kb": 1024000,
+        "gc_compaction_ratio_percent": 200,
     }
 
     vps_http = env.storage_controller.pageserver_api()

From a75e11cc002aae79efd3d8b46fe9c7be96eca326 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 21 Jan 2025 21:56:34 +0100
Subject: [PATCH 18/37] pageserver: return duration from
 `StorageTimeMetricsTimer` (#10468)

## Problem

It's sometimes useful to obtain the elapsed duration from a
`StorageTimeMetricsTimer` for purposes beyond just recording it in
metrics (e.g. to log it).

Extracted from #10405.

## Summary of changes

Add `StorageTimeMetricsTimer.elapsed()` and return the duration from
`stop_and_record()`.
---
 pageserver/src/metrics.rs | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 4758aaf230..252e566f70 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2550,12 +2550,19 @@ impl StorageTimeMetricsTimer {
         }
     }
 
-    /// Record the time from creation to now.
-    pub fn stop_and_record(self) {
-        let duration = self.start.elapsed().as_secs_f64();
-        self.metrics.timeline_sum.inc_by(duration);
+    /// Returns the elapsed duration of the timer.
+    pub fn elapsed(&self) -> Duration {
+        self.start.elapsed()
+    }
+
+    /// Record the time from creation to now and return it.
+    pub fn stop_and_record(self) -> Duration {
+        let duration = self.elapsed();
+        let seconds = duration.as_secs_f64();
+        self.metrics.timeline_sum.inc_by(seconds);
         self.metrics.timeline_count.inc();
-        self.metrics.global_histogram.observe(duration);
+        self.metrics.global_histogram.observe(seconds);
+        duration
     }
 
     /// Turns this timer into a timer, which will always record -- usually this means recording
@@ -2575,6 +2582,14 @@ impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
     }
 }
 
+impl AlwaysRecordingStorageTimeMetricsTimer {
+    /// Returns the elapsed duration of the timer.
+    #[allow(unused)]
+    pub fn elapsed(&self) -> Duration {
+        self.0.as_ref().expect("not dropped yet").elapsed()
+    }
+}
+
 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
 /// timeline total sum and count.
 #[derive(Clone, Debug)]

From 8a8c656c0646895fbfc7a7bc5df4a93917071032 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 21 Jan 2025 22:18:09 +0100
Subject: [PATCH 19/37] pageserver: add `LayerMap::watch_level0_deltas()`
 (#10470)

## Problem

For compaction backpressure, we need a mechanism to signal when
compaction has reduced the L0 delta layer count below the backpressure
threshold.

Extracted from #10405.

## Summary of changes

Add `LayerMap::watch_level0_deltas()` which returns a
`tokio::sync::watch::Receiver` signalling the current L0 delta layer
count.
---
 pageserver/src/tenant/layer_map.rs | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 1b6924425c..a69cce932e 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -57,6 +57,7 @@ use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
+use tokio::sync::watch;
 use utils::lsn::Lsn;
 
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
@@ -67,7 +68,6 @@ use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
 ///
 /// LayerMap tracks what layers exist on a timeline.
 ///
-#[derive(Default)]
 pub struct LayerMap {
     //
     // 'open_layer' holds the current InMemoryLayer that is accepting new
@@ -93,7 +93,25 @@ pub struct LayerMap {
 
     /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
     /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
+    ///
+    /// NB: make sure to notify `watch_l0_deltas` on changes.
     l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
+
+    /// Notifies about L0 delta layer changes, sending the current number of L0 layers.
+    watch_l0_deltas: watch::Sender<usize>,
+}
+
+impl Default for LayerMap {
+    fn default() -> Self {
+        Self {
+            open_layer: Default::default(),
+            next_open_layer_at: Default::default(),
+            frozen_layers: Default::default(),
+            historic: Default::default(),
+            l0_delta_layers: Default::default(),
+            watch_l0_deltas: watch::channel(0).0,
+        }
+    }
 }
 
 /// The primary update API for the layer map.
@@ -466,6 +484,8 @@ impl LayerMap {
 
         if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
             self.l0_delta_layers.push(layer_desc.clone().into());
+            self.watch_l0_deltas
+                .send_replace(self.l0_delta_layers.len());
         }
 
         self.historic.insert(
@@ -488,6 +508,8 @@ impl LayerMap {
             let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
             l0_delta_layers.retain(|other| other.key() != layer_key);
             self.l0_delta_layers = l0_delta_layers;
+            self.watch_l0_deltas
+                .send_replace(self.l0_delta_layers.len());
             // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
             // there's a chance that the comparison fails at runtime due to it comparing (pointer,
             // vtable) pairs.
@@ -850,6 +872,11 @@ impl LayerMap {
         &self.l0_delta_layers
     }
 
+    /// Subscribes to L0 delta layer changes, sending the current number of L0 delta layers.
+    pub fn watch_level0_deltas(&self) -> watch::Receiver<usize> {
+        self.watch_l0_deltas.subscribe()
+    }
+
     /// debugging function to print out the contents of the layer map
     #[allow(unused)]
     pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {

From 14e1f89053b87a38f0dd697f5c32721380d19a9a Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 21 Jan 2025 23:01:27 +0100
Subject: [PATCH 20/37] pageserver: eagerly notify flush waiters (#10469)

## Problem

Currently, the layer flush loop will continue flushing layers as long as
any are pending, and only notify waiters once there are no further
layers to flush. This can cause waiters to wait longer than necessary,
and potentially starve them if pending layers keep arriving faster than
they can be flushed. The impact of this will increase when we add
compaction backpressure and propagate it up into the WAL receiver.

Extracted from #10405.

## Summary of changes

Break out of the layer flush loop once we've flushed up to the requested
LSN. If further flush requests have arrived in the meanwhile, flushing
will resume immediately after.
---
 pageserver/src/tenant/timeline.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5f4272fb2b..3245f23a28 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3617,6 +3617,12 @@ impl Timeline {
                     return;
                 }
 
+                // Break to notify potential waiters as soon as we've flushed the requested LSN. If
+                // more requests have arrived in the meanwhile, we'll resume flushing afterwards.
+                if flushed_to_lsn >= frozen_to_lsn {
+                    break Ok(());
+                }
+
                 let timer = self.metrics.flush_time_histo.start_timer();
 
                 let num_frozen_layers;

From 2b49d6ee050f41cb67d34e4f117196b3d01a2394 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 22 Jan 2025 09:15:52 +0000
Subject: [PATCH 21/37] feat: adjust the tonic features to remove axum
 dependency (#10348)

To help facilitate an upgrade to axum 0.8
(https://github.com/neondatabase/neon/pull/10332#pullrequestreview-2541989619)
this massages the tonic dependency features so that tonic does not
depend on axum.
---
 Cargo.lock                               |  8 -------
 Cargo.toml                               |  2 +-
 libs/wal_decoder/Cargo.toml              |  1 -
 libs/wal_decoder/src/models.rs           |  2 +-
 storage_broker/src/bin/storage_broker.rs | 29 ++++++++++++++----------
 workspace_hack/Cargo.toml                | 11 ++++-----
 6 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 02b02a09c1..2020c417f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7004,12 +7004,9 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
 dependencies = [
- "async-stream",
  "async-trait",
- "axum",
  "base64 0.22.1",
  "bytes",
- "h2 0.4.4",
  "http 1.1.0",
  "http-body 1.0.0",
  "http-body-util",
@@ -7021,7 +7018,6 @@ dependencies = [
  "prost",
  "rustls-native-certs 0.8.0",
  "rustls-pemfile 2.1.1",
- "socket2",
  "tokio",
  "tokio-rustls 0.26.0",
  "tokio-stream",
@@ -7582,7 +7578,6 @@ dependencies = [
  "tikv-jemallocator",
  "tokio",
  "tokio-util",
- "tonic",
  "tonic-build",
  "tracing",
  "utils",
@@ -7991,8 +7986,6 @@ version = "0.1.0"
 dependencies = [
  "ahash",
  "anyhow",
- "axum",
- "axum-core",
  "base64 0.13.1",
  "base64 0.21.1",
  "base64ct",
@@ -8073,7 +8066,6 @@ dependencies = [
  "toml_edit",
  "tonic",
  "tower 0.4.13",
- "tower 0.5.2",
  "tracing",
  "tracing-core",
  "url",
diff --git a/Cargo.toml b/Cargo.toml
index a4e601bb58..6e1e288895 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -187,7 +187,7 @@ tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
-tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
+tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
 tower = { version = "0.5.2", default-features = false }
 tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
 tower-service = "0.3.3"
diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml
index 09c4afb18a..cb0ef4b00d 100644
--- a/libs/wal_decoder/Cargo.toml
+++ b/libs/wal_decoder/Cargo.toml
@@ -17,7 +17,6 @@ postgres_ffi.workspace = true
 serde.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["io-util"] }
-tonic.workspace = true
 tracing.workspace = true
 utils.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs
index c2f9125b21..51bf7e44ab 100644
--- a/libs/wal_decoder/src/models.rs
+++ b/libs/wal_decoder/src/models.rs
@@ -45,7 +45,7 @@ pub mod proto {
     #![allow(clippy::derive_partial_eq_without_eq)]
     // The generated ValueMeta has a `len` method generate for its `len` field.
     #![allow(clippy::len_without_is_empty)]
-    tonic::include_proto!("interpreted_wal");
+    include!(concat!(env!("OUT_DIR"), concat!("/interpreted_wal.rs")));
 }
 
 #[derive(Copy, Clone, Serialize, Deserialize)]
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index 1fbb651656..9d4c22484c 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -32,7 +32,6 @@ use tokio::sync::broadcast::error::RecvError;
 use tokio::time;
 use tonic::body::{self, empty_body, BoxBody};
 use tonic::codegen::Service;
-use tonic::transport::server::Connected;
 use tonic::Code;
 use tonic::{Request, Response, Status};
 use tracing::*;
@@ -459,9 +458,10 @@ impl BrokerService for Broker {
         &self,
         request: Request<tonic::Streaming<SafekeeperTimelineInfo>>,
     ) -> Result<Response<()>, Status> {
-        let remote_addr = request
-            .remote_addr()
-            .expect("TCPConnectInfo inserted by handler");
+        let &RemoteAddr(remote_addr) = request
+            .extensions()
+            .get()
+            .expect("RemoteAddr inserted by handler");
         let mut publisher = self.registry.register_publisher(remote_addr);
 
         let mut stream = request.into_inner();
@@ -484,9 +484,10 @@ impl BrokerService for Broker {
         &self,
         request: Request<SubscribeSafekeeperInfoRequest>,
     ) -> Result<Response<Self::SubscribeSafekeeperInfoStream>, Status> {
-        let remote_addr = request
-            .remote_addr()
-            .expect("TCPConnectInfo inserted by handler");
+        let &RemoteAddr(remote_addr) = request
+            .extensions()
+            .get()
+            .expect("RemoteAddr inserted by handler");
         let proto_key = request
             .into_inner()
             .subscription_key
@@ -537,9 +538,10 @@ impl BrokerService for Broker {
         &self,
         request: Request<SubscribeByFilterRequest>,
     ) -> std::result::Result<Response<Self::SubscribeByFilterStream>, Status> {
-        let remote_addr = request
-            .remote_addr()
-            .expect("TCPConnectInfo inserted by handler");
+        let &RemoteAddr(remote_addr) = request
+            .extensions()
+            .get()
+            .expect("RemoteAddr inserted by handler");
         let proto_filter = request.into_inner();
         let ttid_filter = proto_filter.tenant_timeline_id.as_ref();
 
@@ -628,6 +630,9 @@ async fn http1_handler(
     Ok(resp)
 }
 
+#[derive(Clone, Copy)]
+struct RemoteAddr(SocketAddr);
+
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let args = Args::parse();
@@ -687,13 +692,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             .max_concurrent_streams(None);
 
         let storage_broker_server_cloned = storage_broker_server.clone();
-        let connect_info = stream.connect_info();
+        let remote_addr = RemoteAddr(addr);
         let service_fn_ = async move {
             service_fn(move |mut req| {
                 // That's what tonic's MakeSvc.call does to pass conninfo to
                 // the request handler (and where its request.remote_addr()
                 // expects it to find).
-                req.extensions_mut().insert(connect_info.clone());
+                req.extensions_mut().insert(remote_addr);
 
                 // Technically this second clone is not needed, but consume
                 // by async block is apparently unavoidable. BTW, error
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 0ffeeead18..a3dffa8f19 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -17,8 +17,6 @@ license.workspace = true
 [dependencies]
 ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
-axum = { version = "0.7", features = ["ws"] }
-axum-core = { version = "0.4", default-features = false, features = ["tracing"] }
 base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] }
 base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
@@ -46,7 +44,7 @@ hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] }
 hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] }
-hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] }
+hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] }
 indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] }
 indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] }
 itertools = { version = "0.12" }
@@ -87,12 +85,11 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref
 time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["full", "test-util"] }
 tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] }
-tokio-stream = { version = "0.1", features = ["net"] }
+tokio-stream = { version = "0.1" }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
 toml_edit = { version = "0.22", features = ["serde"] }
-tonic = { version = "0.12", features = ["tls-roots"] }
-tower-9fbad63c4bcf4a8f = { package = "tower", version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] }
-tower-d8f496e17d97b5cb = { package = "tower", version = "0.5", default-features = false, features = ["log", "make", "util"] }
+tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "tls-roots"] }
+tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
 url = { version = "2", features = ["serde"] }

From b4d87b9dfedc3d08d00091e2407d0996a9ea2026 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 22 Jan 2025 11:10:43 +0100
Subject: [PATCH 22/37] fix(tests): actually enable pipelining by default in the
 test suite (#10472)

## Problem

PR #9993 was supposed to enable `page_service_pipelining` by default for
all `NeonEnv`s, but this was ineffective in our CI environment.

Thus, CI Python-based tests and benchmarks, unless explicitly
configuring pipelining, were still using serial protocol handling.

## Analysis

The root cause was that in our CI environment,
`config.compatibility_neon_binpath` is always Truthy.
It's not in local environments, which is why this slipped through in
local testing.

Lesson: always add a log line to pageserver startup and spot-check tests
to ensure the intended default is picked up.

## Summary of changes

Fix it. Since enough time has passed, the compatibility snapshot contains
a recent enough software version so we don't need to worry about
`compatibility_neon_binpath` anymore.

## Future Work

The question of how to add a new default for everything except compatibility
tests, which is what the broken code was supposed to do, is still unsolved.

Slack discussion:
https://neondb.slack.com/archives/C059ZC138NR/p1737490501941309
---
 test_runner/fixtures/neon_fixtures.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a01cb47984..d79c2a5ea8 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1115,13 +1115,11 @@ class NeonEnv:
 
             # Batching (https://github.com/neondatabase/neon/issues/9377):
             # enable batching by default in tests and benchmarks.
-            # Compat tests are exempt because old versions fail to parse the new config.
-            if not config.compatibility_neon_binpath:
-                ps_cfg["page_service_pipelining"] = {
-                    "mode": "pipelined",
-                    "execution": "concurrent-futures",
-                    "max_batch_size": 32,
-                }
+            ps_cfg["page_service_pipelining"] = {
+                "mode": "pipelined",
+                "execution": "concurrent-futures",
+                "max_batch_size": 32,
+            }
 
             if self.pageserver_virtual_file_io_engine is not None:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine

From b31ce14083d3d4130e1190daadbf085082ac7280 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 22 Jan 2025 13:28:26 +0100
Subject: [PATCH 23/37] initial logical size calculation: always poll to
 completion (#10471)

# Refs

- extracted from https://github.com/neondatabase/neon/pull/9353

# Problem

Before this PR, when task_mgr shutdown is signalled, e.g. during
pageserver shutdown or Tenant shutdown, initial logical size calculation
stops polling and drops the future that represents the calculation.

This is against the current policy that we poll all futures to
completion.

This became apparent during development of concurrent IO which warns if
we drop a `Timeline::get_vectored` future that still has in-flight IOs.

We may revise the policy in the future, but, right now initial logical
size calculation is the only part of the codebase that doesn't adhere to
the policy, so let's fix it.

## Code Changes

- make sensitive exclusively to `Timeline::cancel`
- This should be sufficient for all cases of shutdowns; the sensitivity
to task_mgr shutdown is unnecessary.
- this broke the various cancel tests in `test_timeline_size.py`, e.g.,
`test_timeline_initial_logical_size_calculation_cancellation`
- the tests would time out because the await point was not sensitive to
cancellation
- to fix this, refactor `pausable_failpoint` so that it accepts a
cancellation token
- side note: we _really_ should write our own failpoint library; maybe
after we get heap-allocated RequestContext, we can plumb failpoints
through there.
---
 libs/utils/src/failpoint_support.rs | 54 ++++++++++++++++-------
 pageserver/src/tenant/tasks.rs      |  7 ++-
 pageserver/src/tenant/timeline.rs   | 67 +++++++++++------------------
 3 files changed, 66 insertions(+), 62 deletions(-)

diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs
index 701ba2d42c..272c6ebb26 100644
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -11,31 +11,55 @@ use tracing::*;
 
 /// Declare a failpoint that can use to `pause` failpoint action.
 /// We don't want to block the executor thread, hence, spawn_blocking + await.
+///
+/// Optionally pass a cancellation token, and this failpoint will drop out of
+/// its pause when the cancellation token fires. This is useful for testing
+/// cases where we would like to block something, but test its clean shutdown behavior.
+/// The macro evaluates to a Result in that case, where Ok(()) is the case
+/// where the failpoint was not paused, and Err() is the case where cancellation
+/// token fired while evaluating the failpoint.
+///
+/// Remember to unpause the failpoint in the test; until that happens, one of the
+/// limited number of spawn_blocking thread pool threads is leaked.
 #[macro_export]
 macro_rules! pausable_failpoint {
-    ($name:literal) => {
+    ($name:literal) => {{
         if cfg!(feature = "testing") {
-            tokio::task::spawn_blocking({
-                let current = tracing::Span::current();
+            let cancel = ::tokio_util::sync::CancellationToken::new();
+            let _ = $crate::pausable_failpoint!($name, &cancel);
+        }
+    }};
+    ($name:literal, $cancel:expr) => {{
+        if cfg!(feature = "testing") {
+            let failpoint_fut = ::tokio::task::spawn_blocking({
+                let current = ::tracing::Span::current();
                 move || {
                     let _entered = current.entered();
-                    tracing::info!("at failpoint {}", $name);
-                    fail::fail_point!($name);
+                    ::tracing::info!("at failpoint {}", $name);
+                    ::fail::fail_point!($name);
+                }
+            });
+            let cancel_fut = async move {
+                $cancel.cancelled().await;
+            };
+            ::tokio::select! {
+                res = failpoint_fut => {
+                    res.expect("spawn_blocking");
+                    // continue with execution
+                    Ok(())
+                },
+                _ = cancel_fut => {
+                    Err(())
                 }
-            })
-            .await
-            .expect("spawn_blocking");
-        }
-    };
-    ($name:literal, $cond:expr) => {
-        if cfg!(feature = "testing") {
-            if $cond {
-                pausable_failpoint!($name)
             }
+        } else {
+            Ok(())
         }
-    };
+    }};
 }
 
+pub use pausable_failpoint;
+
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 0118a5ce5f..3725e2f7fc 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -67,10 +67,9 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
 ) -> tokio::sync::SemaphorePermit<'static> {
     let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
 
-    pausable_failpoint!(
-        "initial-size-calculation-permit-pause",
-        loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
-    );
+    if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
+        pausable_failpoint!("initial-size-calculation-permit-pause");
+    }
 
     // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
     match CONCURRENT_BACKGROUND_TASKS.acquire().await {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 3245f23a28..e83b516d79 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -60,6 +60,7 @@ use utils::{
 use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
 
 use std::sync::atomic::Ordering as AtomicOrdering;
+use std::sync::OnceLock;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 use std::{
@@ -72,7 +73,6 @@ use std::{
     collections::btree_map::Entry,
     ops::{Deref, Range},
 };
-use std::{pin::pin, sync::OnceLock};
 
 use crate::{
     aux_file::AuxFileSizeEstimator,
@@ -2804,12 +2804,10 @@ impl Timeline {
             "initial size calculation",
             // NB: don't log errors here, task_mgr will do that.
             async move {
-                let cancel = task_mgr::shutdown_token();
                 self_clone
                     .initial_logical_size_calculation_task(
                         initial_part_end,
                         cancel_wait_for_background_loop_concurrency_limit_semaphore,
-                        cancel,
                         background_ctx,
                     )
                     .await;
@@ -2819,11 +2817,21 @@ impl Timeline {
         );
     }
 
+    /// # Cancellation
+    ///
+    /// This method is sensitive to `Timeline::cancel`.
+    ///
+    /// It is _not_ sensitive to task_mgr::shutdown_token().
+    ///
+    /// # Cancel-Safety
+    ///
+    /// It does Timeline IO, hence this should be polled to completion because
+    /// we could be leaving in-flight IOs behind, which is safe, but annoying
+    /// to reason about.
     async fn initial_logical_size_calculation_task(
         self: Arc<Self>,
         initial_part_end: Lsn,
         skip_concurrency_limiter: CancellationToken,
-        cancel: CancellationToken,
         background_ctx: RequestContext,
     ) {
         scopeguard::defer! {
@@ -2836,7 +2844,6 @@ impl Timeline {
             let self_ref = &self;
             let skip_concurrency_limiter = &skip_concurrency_limiter;
             async move {
-                let cancel = task_mgr::shutdown_token();
                 let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
                     BackgroundLoopKind::InitialLogicalSizeCalculation,
                     background_ctx,
@@ -2850,9 +2857,6 @@ impl Timeline {
                     _ = self_ref.cancel.cancelled() => {
                         return Err(CalculateLogicalSizeError::Cancelled);
                     }
-                    _ = cancel.cancelled() => {
-                        return Err(CalculateLogicalSizeError::Cancelled);
-                    },
                     () = skip_concurrency_limiter.cancelled() => {
                         // Some action that is part of a end user interaction requested logical size
                         // => break out of the rate limit
@@ -2911,22 +2915,18 @@ impl Timeline {
                             )
                             .expect("10min < 1hour"),
                         );
-                        tokio::time::sleep(sleep_duration).await;
+                        tokio::select! {
+                            _ = tokio::time::sleep(sleep_duration) => {}
+                            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
+                        }
                     }
                 }
             }
         };
 
-        let (calculated_size, metrics_guard) = tokio::select! {
-            res = retrying  => {
-                match res {
-                    ControlFlow::Continue(calculated_size) => calculated_size,
-                    ControlFlow::Break(()) => return,
-                }
-            }
-            _ = cancel.cancelled() => {
-                return;
-            }
+        let (calculated_size, metrics_guard) = match retrying.await {
+            ControlFlow::Continue(calculated_size) => calculated_size,
+            ControlFlow::Break(()) => return,
         };
 
         // we cannot query current_logical_size.current_size() to know the current
@@ -2982,9 +2982,6 @@ impl Timeline {
         receiver
     }
 
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
     #[instrument(skip_all)]
     async fn logical_size_calculation_task(
         self: &Arc<Self>,
@@ -3002,32 +2999,13 @@ impl Timeline {
             .enter()
             .map_err(|_| CalculateLogicalSizeError::Cancelled)?;
 
-        let self_calculation = Arc::clone(self);
-
-        let mut calculation = pin!(async {
-            let ctx = ctx.attached_child();
-            self_calculation
-                .calculate_logical_size(lsn, cause, &guard, &ctx)
-                .await
-        });
-
-        tokio::select! {
-            res = &mut calculation => { res }
-            _ = self.cancel.cancelled() => {
-                debug!("cancelling logical size calculation for timeline shutdown");
-                calculation.await
-            }
-        }
+        self.calculate_logical_size(lsn, cause, &guard, ctx).await
     }
 
     /// Calculate the logical size of the database at the latest LSN.
     ///
     /// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
     /// especially if we need to download remote layers.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
     async fn calculate_logical_size(
         &self,
         up_to_lsn: Lsn,
@@ -3040,7 +3018,10 @@ impl Timeline {
             self.timeline_id, up_to_lsn
         );
 
-        pausable_failpoint!("timeline-calculate-logical-size-pause");
+        if let Err(()) = pausable_failpoint!("timeline-calculate-logical-size-pause", &self.cancel)
+        {
+            return Err(CalculateLogicalSizeError::Cancelled);
+        }
 
         // See if we've already done the work for initial size calculation.
         // This is a short-cut for timelines that are mostly unused.

From 881e351f693ae1d177d57b171569d327d0abcfe8 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Wed, 22 Jan 2025 13:38:23 +0100
Subject: [PATCH 24/37] feat(compute): Allow installing both 0.8.0 and 0.7.4
 pgvector (#10345)

## Problem

Both these versions are binary compatible, but the way pgvector
structures the SQL files forbids installing 0.7.4 if you have a 0.8.0
distribution. Yet, some users may need a previous version for backward
compatibility, e.g., restoring the dump.

See this thread for discussion

https://neondb.slack.com/archives/C04DGM6SMTM/p1735911490242919?thread_ts=1731343604.259169&cid=C04DGM6SMTM

## Summary of changes

Put `vector--0.7.4.sql` file into compute image to allow installing this
version as well.

Tested on staging and it seems to be working as expected:
```sql
select * from pg_available_extensions where name = 'vector';
  name  | default_version | installed_version |                       comment
--------+-----------------+-------------------+------------------------------------------------------
 vector | 0.8.0           | (null)            | vector data type and ivfflat and hnsw access methods

create extension vector version '0.7.4';

select * from pg_available_extensions where name = 'vector';
  name  | default_version | installed_version |                       comment
--------+-----------------+-------------------+------------------------------------------------------
 vector | 0.8.0           | 0.7.4             | vector data type and ivfflat and hnsw access methods

alter extension vector update;

select * from pg_available_extensions where name = 'vector';
  name  | default_version | installed_version |                       comment
--------+-----------------+-------------------+------------------------------------------------------
 vector | 0.8.0           | 0.8.0             | vector data type and ivfflat and hnsw access methods

drop extension vector;
create extension vector;

select * from pg_available_extensions where name = 'vector';
  name  | default_version | installed_version |                       comment
--------+-----------------+-------------------+------------------------------------------------------
 vector | 0.8.0           | 0.8.0             | vector data type and ivfflat and hnsw access methods
```

If we find out it's a good approach, we can adopt the same for other
extensions with a stable ABI -- support both `current` and `current - 1`
releases.
---
 compute/compute-node.Dockerfile |  2 ++
 compute/patches/pgvector.patch  | 22 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index dbe7de046b..706c947008 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -360,6 +360,8 @@ COPY compute/patches/pgvector.patch /pgvector.patch
 RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \
     echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \
+    echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \
     patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
     make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
diff --git a/compute/patches/pgvector.patch b/compute/patches/pgvector.patch
index 3e1ffcaaaf..da41c86140 100644
--- a/compute/patches/pgvector.patch
+++ b/compute/patches/pgvector.patch
@@ -1,8 +1,24 @@
+diff --git a/Makefile b/Makefile
+index 7a4b88c..56678af 100644
+--- a/Makefile
++++ b/Makefile
+@@ -3,7 +3,10 @@ EXTVERSION = 0.8.0
+ 
+ MODULE_big = vector
+ DATA = $(wildcard sql/*--*--*.sql)
+-DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql
++# This change is needed to install different per-version SQL files
++# like pgvector--0.8.0.sql and pgvector--0.7.4.sql
++# The corresponding file is downloaded during the Docker image build process
++DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql sql/vector--0.7.4.sql
+ OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o
+ HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
+ 
 diff --git a/src/hnswbuild.c b/src/hnswbuild.c
-index dcfb2bd..d5189ee 100644
+index b667478..fc1897c 100644
 --- a/src/hnswbuild.c
 +++ b/src/hnswbuild.c
-@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
+@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
  
  	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
  
@@ -20,7 +36,7 @@ index dcfb2bd..d5189ee 100644
  	/* Close relations within worker */
  	index_close(indexRel, indexLockmode);
  	table_close(heapRel, heapLockmode);
-@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+@@ -1100,12 +1108,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
  	SeedRandom(42);
  #endif
  

From 414ed82c1fec460a9e2e460a32ef1122aac8a32c Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 22 Jan 2025 15:30:23 +0000
Subject: [PATCH 25/37] pageserver: issue concurrent IO on the read path
 (#9353)

## Refs

- Epic: https://github.com/neondatabase/neon/issues/9378

Co-authored-by: Vlad Lazar <vlad@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>

## Problem

The read path does its IOs sequentially.
This means that if N values need to be read to reconstruct a page,
we will do N IOs and getpage latency is `O(N*IoLatency)`.

## Solution

With this PR we gain the ability to issue IO concurrently within one
layer visit **and** to move on to the next layer without waiting for IOs
from the previous visit to complete.

This is an evolved version of the work done at the Lisbon hackathon,
cf https://github.com/neondatabase/neon/pull/9002.

## Design

### `will_init` now sourced from disk btree index keys

On the algorithmic level, the only change is that the
`get_values_reconstruct_data`
now sources `will_init` from the disk btree index key (which is
PS-page_cache'd), instead
of from the `Value`, which is only available after the IO completes.

### Concurrent IOs, Submission & Completion

To separate IO submission from waiting for its completion, while
simultaneously
feature-gating the change, we introduce the notion of an `IoConcurrency`
struct
through which IO futures are "spawned".

An IO is an opaque future, and waiting for completions is handled
through
`tokio::sync::oneshot` channels.
The oneshot Receivers take the place of the `img` and `records` fields
inside `VectoredValueReconstructState`.

When we're done visiting all the layers and submitting all the IOs along
the way
we concurrently `collect_pending_ios` for each value, which means
for each value there is a future that awaits all the oneshot receivers
and then calls into walredo to reconstruct the page image.
Walredo is now invoked concurrently for each value instead of
sequentially.
Walredo itself remains unchanged.

The spawned IO futures are driven to completion by a sidecar tokio task
that
is separate from the task that performs all the layer visiting and
spawning of IOs.
That task receives the IO futures via an unbounded mpsc channel and
drives them to completion inside a `FuturesUnordered`.

(The behavior from before this PR is available through
`IoConcurrency::Sequential`,
which awaits the IO futures in place, without "spawning" or "submitting"
them
anywhere.)

#### Alternatives Explored

A few words on the rationale behind having a sidecar *task* and what
alternatives were considered.

One option is to queue up all IO futures in a FuturesUnordered that is
polled
the first time when we `collect_pending_ios`.

Firstly, the IO futures are opaque, compiler-generated futures that need
to be polled at least once to submit their IO. "At least once" because
tokio-epoll-uring may not be able to submit the IO to the kernel on
first
poll right away.

Second, there are deadlocks if we don't drive the IO futures to
completion
independently of the spawning task.
The reason is that both the IO futures and the spawning task may hold
some
_and_ try to acquire _more_ shared limited resources.
For example, both spawning task and IO future may try to acquire
* a VirtualFile file descriptor cache slot async mutex (observed during
impl)
* a tokio-epoll-uring submission slot (observed during impl)
* a PageCache slot (currently this is not the case but we may move more
code into the IO futures in the future)

Another option is to spawn a short-lived `tokio::task` for each IO
future.
We implemented and benchmarked it during development, but found little
throughput improvement and moderate mean & tail latency degradation.
Concerns about pressure on the tokio scheduler made us discard this
variant.

The sidecar task could be obsoleted if the IOs were not arbitrary code
but a well-defined struct.
However,
1. the opaque futures approach taken in this PR allows leaving the
existing
   code unchanged, which
2. allows us to implement the `IoConcurrency::Sequential` mode for
feature-gating
   the change.

Once the new mode sidecar task implementation is rolled out everywhere,
and `::Sequential` removed, we can think about a descriptive submission
& completion interface.
The problems around deadlocks pointed out earlier will need to be solved
then.
For example, we could eliminate VirtualFile file descriptor cache and
tokio-epoll-uring slots.
The latter has been drafted in
https://github.com/neondatabase/tokio-epoll-uring/pull/63.

See the lengthy doc comment on `spawn_io()` for more details.

### Error handling

There are two error classes during reconstruct data retrieval:
* traversal errors: index lookup, move to next layer, and the like
* value read IO errors

A traversal error fails the entire get_vectored request, as before this
PR.
A value read error only fails that value.

In any case, we preserve the existing behavior that once
`get_vectored` returns, all IOs are done. Panics and failing
to poll `get_vectored` to completion will leave the IOs dangling,
which is safe but shouldn't happen, and so, a rate-limited
log statement will be emitted at warning level.
There is a doc comment on `collect_pending_ios` giving more code-level
details and rationale.

### Feature Gating

The new behavior is opt-in via pageserver config.
The `Sequential` mode is the default.
The only significant change in `Sequential` mode compared to before
this PR is the buffering of results in the `oneshot`s.

## Code-Level Changes

Prep work:
  * Make `GateGuard` clonable.

Core Feature:
* Traversal code: track  `will_init` in `BlobMeta` and source it from
the Delta/Image/InMemory layer index, instead of determining `will_init`
  after we've read the value. This avoids having to read the value to
  determine whether traversal can stop.
* Introduce `IoConcurrency` & its sidecar task.
  * `IoConcurrency` is the clonable handle.
  * It connects to the sidecar task via an `mpsc`.
* Plumb through `IoConcurrency` from high level code to the
  individual layer implementations' `get_values_reconstruct_data`.
  We piggy-back on the `ValuesReconstructState` for this.
   * The sidecar task should be long-lived, so, `IoConcurrency` needs
     to be rooted up "high" in the call stack.
   * Roots as of this PR:
     * `page_service`: outside of pagestream loop
     * `create_image_layers`: when it is called
     * `basebackup`(only auxfiles + replorigin + SLRU segments)
   * Code with no roots that uses `IoConcurrency::sequential`
     * any `Timeline::get` call
       * `collect_keyspace` is a good example
       * follow-up: https://github.com/neondatabase/neon/issues/10460
* `TimelineAdaptor` code used by the compaction simulator, unused in
practice
     * `ingest_xlog_dbase_create`
* Transform Delta/Image/InMemoryLayer to
  * do their values IO in a distinct `async {}` block
  * extend the residence of the Delta/Image layer until the IO is done
  * buffer their results in a `oneshot` channel instead of straight
    in `ValuesReconstructState`
* the `oneshot` channel is wrapped in `OnDiskValueIo` /
`OnDiskValueIoWaiter`
    types that aid in expressiveness and are used to keep track of
    in-flight IOs so we can print warnings if we leave them dangling.
* Change `ValuesReconstructState` to hold the receiving end of the
 `oneshot` channel aka `OnDiskValueIoWaiter`.
* Change `get_vectored_impl` to `collect_pending_ios` and issue walredo
concurrently, in a `FuturesUnordered`.

Testing / Benchmarking:
* Support queue-depth in pagebench for manual benchmarking.
* Add test suite support for setting concurrency mode ps config
   field via a) an env var and b) via NeonEnvBuilder.
* Hacky helper to have sidecar-based IoConcurrency in tests.
   This will be cleaned up later.

More benchmarking will happen post-merge in nightly benchmarks, plus in
staging/pre-prod.
Some intermediate helpers for manual benchmarking have been preserved in
https://github.com/neondatabase/neon/pull/10466 and will be landed in
later PRs.
(L0 layer stack generator!)

Drive-By:
* test suite actually didn't enable batching by default because
`config.compatibility_neon_binpath` is always Truthy in our CI
environment
  => https://neondb.slack.com/archives/C059ZC138NR/p1737490501941309
* initial logical size calculation wasn't always polled to completion,
which was
  surfaced through the added WARN logs emitted when dropping a
  `ValuesReconstructState` that still has inflight IOs.
* remove the timing histograms
`pageserver_getpage_get_reconstruct_data_seconds`
and `pageserver_getpage_reconstruct_seconds` because with planning,
value read
IO, and walredo happening concurrently, one can no longer attribute
latency
to any one of them; we'll revisit this when Vlad's work on
tracing/sampling
  through RequestContext lands.
* remove code related to `get_cached_lsn()`.
  The logic around this has been dead at runtime for a long time,
  ever since the removal of the materialized page cache in #8105.

## Testing

Unit tests use the sidecar task by default and run both modes in CI.
Python regression tests and benchmarks also use the sidecar task by
default.
We'll test more in staging and possibly preprod.

# Future Work

Please refer to the parent epic for the full plan.

The next step will be to fold the plumbing of IoConcurrency
into RequestContext so that the function signatures get cleaned up.

Once `Sequential` isn't used anymore, we can take the next
big leap which is replacing the opaque IOs with structs
that have well-defined semantics.

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 .github/workflows/_build-and-test-locally.yml |  10 +-
 Cargo.lock                                    |   4 +-
 libs/pageserver_api/src/config.rs             |  25 +
 libs/utils/src/env.rs                         |  26 +-
 libs/utils/src/sync/gate.rs                   |  53 +-
 .../pagebench/src/cmd/getpage_latest_lsn.rs   |  61 +-
 pageserver/src/basebackup.rs                  |  23 +-
 pageserver/src/bin/pageserver.rs              |   1 +
 pageserver/src/config.rs                      |   4 +
 pageserver/src/http/routes.rs                 |  10 +-
 pageserver/src/metrics.rs                     |  68 --
 pageserver/src/page_service.rs                |  63 +-
 pageserver/src/pgdatadir_mapping.rs           |  45 +-
 pageserver/src/tenant.rs                      |  68 +-
 pageserver/src/tenant/storage_layer.rs        | 713 +++++++++++++++---
 .../src/tenant/storage_layer/delta_layer.rs   | 179 ++---
 .../src/tenant/storage_layer/image_layer.rs   | 112 +--
 .../tenant/storage_layer/inmemory_layer.rs    | 106 ++-
 pageserver/src/tenant/storage_layer/layer.rs  |  18 +-
 .../src/tenant/storage_layer/layer/tests.rs   |  19 +-
 pageserver/src/tenant/timeline.rs             | 133 ++--
 pageserver/src/tenant/timeline/compaction.rs  |   3 +-
 pageserver/src/tenant/vectored_blob_io.rs     | 103 ++-
 pageserver/src/walingest.rs                   | 155 +++-
 test_runner/fixtures/common_types.py          |   4 +
 test_runner/fixtures/metrics.py               |   4 -
 test_runner/fixtures/neon_fixtures.py         |  29 +
 test_runner/fixtures/parametrize.py           |   5 +
 test_runner/regress/test_compatibility.py     |   2 +
 29 files changed, 1490 insertions(+), 556 deletions(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 4263bacce8..2daed90386 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -229,8 +229,13 @@ jobs:
           ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
 
           # run pageserver tests with different settings
-          for io_engine in std-fs tokio-epoll-uring ; do
-            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+          for get_vectored_concurrent_io in sequential sidecar-task; do
+            for io_engine in std-fs tokio-epoll-uring ; do
+              NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
+                NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
+                ${cov_prefix} \
+                cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+            done
           done
 
           # Run separate tests for real S3
@@ -314,6 +319,7 @@ jobs:
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ inputs.build-tag }}
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
           USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
 
       # Temporary disable this step until we figure out why it's so flaky
diff --git a/Cargo.lock b/Cargo.lock
index 2020c417f0..1f090a27e4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6774,7 +6774,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497"
 dependencies = [
  "futures",
  "nix 0.26.4",
@@ -7369,7 +7369,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497"
 dependencies = [
  "bytes",
  "io-uring",
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index f0aeb00736..4982c6233d 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -120,6 +120,7 @@ pub struct ConfigToml {
     pub no_sync: Option<bool>,
     pub wal_receiver_protocol: PostgresClientProtocol,
     pub page_service_pipelining: PageServicePipeliningConfig,
+    pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -158,6 +159,25 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
     Tasks,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(tag = "mode", rename_all = "kebab-case")]
+#[serde(deny_unknown_fields)]
+pub enum GetVectoredConcurrentIo {
+    /// The read path is fully sequential: layers are visited
+    /// one after the other and IOs are issued and waited upon
+    /// from the same task that traverses the layers.
+    Sequential,
+    /// The read path still traverses layers sequentially, and
+    /// index blocks will be read into the PS PageCache from
+    /// that task, with waiting.
+    /// But data IOs are dispatched and waited upon from a sidecar
+    /// task so that the traversing task can continue to traverse
+    /// layers while the IOs are in flight.
+    /// If the PS PageCache miss rate is low, this improves
+    /// throughput dramatically.
+    SidecarTask,
+}
+
 pub mod statvfs {
     pub mod mock {
         #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -464,6 +484,11 @@ impl Default for ConfigToml {
                     execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
                 })
             },
+            get_vectored_concurrent_io: if !cfg!(test) {
+                GetVectoredConcurrentIo::Sequential
+            } else {
+                GetVectoredConcurrentIo::SidecarTask
+            },
         }
     }
 }
diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs
index b3e326bfd0..a1bcec9229 100644
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -2,6 +2,7 @@
 
 use std::{fmt::Display, str::FromStr};
 
+/// For types `V` that implement [`FromStr`].
 pub fn var<V, E>(varname: &str) -> Option<V>
 where
     V: FromStr<Err = E>,
@@ -10,7 +11,9 @@ where
     match std::env::var(varname) {
         Ok(s) => Some(
             s.parse()
-                .map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
+                .map_err(|e| {
+                    format!("failed to parse env var {varname} using FromStr::parse: {e:#}")
+                })
                 .unwrap(),
         ),
         Err(std::env::VarError::NotPresent) => None,
@@ -19,3 +22,24 @@ where
         }
     }
 }
+
+/// For types `V` that implement [`serde::de::DeserializeOwned`].
+pub fn var_serde_json_string<V>(varname: &str) -> Option<V>
+where
+    V: serde::de::DeserializeOwned,
+{
+    match std::env::var(varname) {
+        Ok(s) => Some({
+            let value = serde_json::Value::String(s);
+            serde_json::from_value(value)
+                .map_err(|e| {
+                    format!("failed to parse env var {varname} as a serde_json json string: {e:#}")
+                })
+                .unwrap()
+        }),
+        Err(std::env::VarError::NotPresent) => None,
+        Err(std::env::VarError::NotUnicode(_)) => {
+            panic!("env var {varname} is not unicode")
+        }
+    }
+}
diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs
index 16ec563fa7..0a1ed81621 100644
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -64,6 +64,12 @@ pub struct GateGuard {
     gate: Arc<GateInner>,
 }
 
+impl GateGuard {
+    pub fn try_clone(&self) -> Result<Self, GateError> {
+        Gate::enter_impl(self.gate.clone())
+    }
+}
+
 impl Drop for GateGuard {
     fn drop(&mut self) {
         if self.gate.closing.load(Ordering::Relaxed) {
@@ -107,11 +113,11 @@ impl Gate {
     /// to avoid blocking close() indefinitely: typically types that contain a Gate will
     /// also contain a CancellationToken.
     pub fn enter(&self) -> Result<GateGuard, GateError> {
-        let permit = self
-            .inner
-            .sem
-            .try_acquire()
-            .map_err(|_| GateError::GateClosed)?;
+        Self::enter_impl(self.inner.clone())
+    }
+
+    fn enter_impl(gate: Arc<GateInner>) -> Result<GateGuard, GateError> {
+        let permit = gate.sem.try_acquire().map_err(|_| GateError::GateClosed)?;
 
         // we now have the permit, let's disable the normal raii functionality and leave
         // "returning" the permit to our GateGuard::drop.
@@ -122,7 +128,7 @@ impl Gate {
 
         Ok(GateGuard {
             span_at_enter: tracing::Span::current(),
-            gate: self.inner.clone(),
+            gate,
         })
     }
 
@@ -252,4 +258,39 @@ mod tests {
         // Attempting to enter() is still forbidden
         gate.enter().expect_err("enter should fail finishing close");
     }
+
+    #[tokio::test(start_paused = true)]
+    async fn clone_gate_guard() {
+        let gate = Gate::default();
+        let forever = Duration::from_secs(24 * 7 * 365);
+
+        let guard1 = gate.enter().expect("gate isn't closed");
+
+        let guard2 = guard1.try_clone().expect("gate isn't closed");
+
+        let mut close_fut = std::pin::pin!(gate.close());
+
+        tokio::time::timeout(forever, &mut close_fut)
+            .await
+            .unwrap_err();
+
+        // we polled close_fut once, that should prevent all later enters and clones
+        gate.enter().unwrap_err();
+        guard1.try_clone().unwrap_err();
+        guard2.try_clone().unwrap_err();
+
+        // guard2 keeps the gate open even after guard1 is dropped
+        drop(guard1);
+        tokio::time::timeout(forever, &mut close_fut)
+            .await
+            .unwrap_err();
+
+        drop(guard2);
+
+        // now that the last guard is dropped, closing should complete
+        close_fut.await;
+
+        // entering is still forbidden
+        gate.enter().expect_err("enter should still fail");
+    }
 }
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 9f3984f1bd..a60efc7567 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -13,7 +13,7 @@ use rand::prelude::*;
 use tokio::task::JoinSet;
 use tracing::info;
 
-use std::collections::HashSet;
+use std::collections::{HashSet, VecDeque};
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
@@ -63,6 +63,10 @@ pub(crate) struct Args {
     #[clap(long)]
     set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
 
+    /// Queue depth generated in each client.
+    #[clap(long, default_value = "1")]
+    queue_depth: NonZeroUsize,
+
     targets: Option<Vec<TenantTimelineId>>,
 }
 
@@ -298,6 +302,7 @@ async fn main_impl(
             start_work_barrier.wait().await;
             let client_start = Instant::now();
             let mut ticks_processed = 0;
+            let mut inflight = VecDeque::new();
             while !cancel.is_cancelled() {
                 // Detect if a request took longer than the RPS rate
                 if let Some(period) = &rps_period {
@@ -311,31 +316,37 @@ async fn main_impl(
                     ticks_processed = periods_passed_until_now;
                 }
 
-                let start = Instant::now();
-                let req = {
-                    let mut rng = rand::thread_rng();
-                    let r = &ranges[weights.sample(&mut rng)];
-                    let key: i128 = rng.gen_range(r.start..r.end);
-                    let key = Key::from_i128(key);
-                    assert!(key.is_rel_block_key());
-                    let (rel_tag, block_no) = key
-                        .to_rel_block()
-                        .expect("we filter non-rel-block keys out above");
-                    PagestreamGetPageRequest {
-                        hdr: PagestreamRequest {
-                            reqid: 0,
-                            request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                                Lsn::MAX
-                            } else {
-                                r.timeline_lsn
+                while inflight.len() < args.queue_depth.get() {
+                    let start = Instant::now();
+                    let req = {
+                        let mut rng = rand::thread_rng();
+                        let r = &ranges[weights.sample(&mut rng)];
+                        let key: i128 = rng.gen_range(r.start..r.end);
+                        let key = Key::from_i128(key);
+                        assert!(key.is_rel_block_key());
+                        let (rel_tag, block_no) = key
+                            .to_rel_block()
+                            .expect("we filter non-rel-block keys out above");
+                        PagestreamGetPageRequest {
+                            hdr: PagestreamRequest {
+                                reqid: 0,
+                                request_lsn: if rng.gen_bool(args.req_latest_probability) {
+                                    Lsn::MAX
+                                } else {
+                                    r.timeline_lsn
+                                },
+                                not_modified_since: r.timeline_lsn,
                             },
-                            not_modified_since: r.timeline_lsn,
-                        },
-                        rel: rel_tag,
-                        blkno: block_no,
-                    }
-                };
-                client.getpage(req).await.unwrap();
+                            rel: rel_tag,
+                            blkno: block_no,
+                        }
+                    };
+                    client.getpage_send(req).await.unwrap();
+                    inflight.push_back(start);
+                }
+
+                let start = inflight.pop_front().unwrap();
+                client.getpage_recv().await.unwrap();
                 let end = Instant::now();
                 live_stats.request_done();
                 ticks_processed += 1;
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index e1b5676f46..a6087920fd 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -25,6 +25,7 @@ use tokio_tar::{Builder, EntryType, Header};
 
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
+use crate::tenant::storage_layer::IoConcurrency;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};
 
@@ -123,6 +124,13 @@ where
         full_backup,
         replica,
         ctx,
+        io_concurrency: IoConcurrency::spawn_from_conf(
+            timeline.conf,
+            timeline
+                .gate
+                .enter()
+                .map_err(|e| BasebackupError::Server(e.into()))?,
+        ),
     };
     basebackup
         .send_tarball()
@@ -144,6 +152,7 @@ where
     full_backup: bool,
     replica: bool,
     ctx: &'a RequestContext,
+    io_concurrency: IoConcurrency,
 }
 
 /// A sink that accepts SLRU blocks ordered by key and forwards
@@ -303,7 +312,7 @@ where
             for part in slru_partitions.parts {
                 let blocks = self
                     .timeline
-                    .get_vectored(part, self.lsn, self.ctx)
+                    .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
                     .await
                     .map_err(|e| BasebackupError::Server(e.into()))?;
 
@@ -358,7 +367,7 @@ where
         let start_time = Instant::now();
         let aux_files = self
             .timeline
-            .list_aux_files(self.lsn, self.ctx)
+            .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
             .await
             .map_err(|e| BasebackupError::Server(e.into()))?;
         let aux_scan_time = start_time.elapsed();
@@ -422,7 +431,7 @@ where
         }
         let repl_origins = self
             .timeline
-            .get_replorigins(self.lsn, self.ctx)
+            .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
             .await
             .map_err(|e| BasebackupError::Server(e.into()))?;
         let n_origins = repl_origins.len();
@@ -489,7 +498,13 @@ where
             for blknum in startblk..endblk {
                 let img = self
                     .timeline
-                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
+                    .get_rel_page_at_lsn(
+                        src,
+                        blknum,
+                        Version::Lsn(self.lsn),
+                        self.ctx,
+                        self.io_concurrency.clone(),
+                    )
                     .await
                     .map_err(|e| BasebackupError::Server(e.into()))?;
                 segment_data.extend_from_slice(&img[..]);
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 921c6a5092..5764728505 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -135,6 +135,7 @@ fn main() -> anyhow::Result<()> {
     info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
     info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
     info!(?conf.page_service_pipelining, "starting with page service pipelining config");
+    info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");
 
     // The tenants directory contains all the pageserver local disk state.
     // Create if not exists and make sure all the contents are durable before proceeding.
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 1651db8500..ce480c70a0 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -191,6 +191,8 @@ pub struct PageServerConf {
     pub wal_receiver_protocol: PostgresClientProtocol,
 
     pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
+
+    pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
 }
 
 /// Token for authentication to safekeepers
@@ -352,6 +354,7 @@ impl PageServerConf {
             no_sync,
             wal_receiver_protocol,
             page_service_pipelining,
+            get_vectored_concurrent_io,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -396,6 +399,7 @@ impl PageServerConf {
             import_pgdata_aws_endpoint_url,
             wal_receiver_protocol,
             page_service_pipelining,
+            get_vectored_concurrent_io,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 33b2d04588..5452719bcd 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -84,6 +84,7 @@ use crate::tenant::remote_timeline_client::list_remote_tenant_shards;
 use crate::tenant::remote_timeline_client::list_remote_timelines;
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
+use crate::tenant::storage_layer::IoConcurrency;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::import_pgdata;
@@ -2938,8 +2939,15 @@ async fn list_aux_files(
         active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
             .await?;
 
+    let io_concurrency = IoConcurrency::spawn_from_conf(
+        state.conf,
+        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
+    );
+
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let files = timeline.list_aux_files(body.lsn, &ctx).await?;
+    let files = timeline
+        .list_aux_files(body.lsn, &ctx, io_concurrency)
+        .await?;
     json_response(StatusCode::OK, files)
 }
 
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 252e566f70..02467cb6f7 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -126,73 +126,6 @@ pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
     .expect("failed to define metric")
 });
 
-// Metrics collected on operations on the storage repository.
-#[derive(
-    Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
-)]
-pub(crate) enum GetKind {
-    Singular,
-    Vectored,
-}
-
-pub(crate) struct ReconstructTimeMetrics {
-    singular: Histogram,
-    vectored: Histogram,
-}
-
-pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
-    let inner = register_histogram_vec!(
-        "pageserver_getpage_reconstruct_seconds",
-        "Time spent in reconstruct_value (reconstruct a page from deltas)",
-        &["get_kind"],
-        CRITICAL_OP_BUCKETS.into(),
-    )
-    .expect("failed to define a metric");
-
-    ReconstructTimeMetrics {
-        singular: inner.with_label_values(&[GetKind::Singular.into()]),
-        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
-    }
-});
-
-impl ReconstructTimeMetrics {
-    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
-        match get_kind {
-            GetKind::Singular => &self.singular,
-            GetKind::Vectored => &self.vectored,
-        }
-    }
-}
-
-pub(crate) struct ReconstructDataTimeMetrics {
-    singular: Histogram,
-    vectored: Histogram,
-}
-
-impl ReconstructDataTimeMetrics {
-    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
-        match get_kind {
-            GetKind::Singular => &self.singular,
-            GetKind::Vectored => &self.vectored,
-        }
-    }
-}
-
-pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
-    let inner = register_histogram_vec!(
-        "pageserver_getpage_get_reconstruct_data_seconds",
-        "Time spent in get_reconstruct_value_data",
-        &["get_kind"],
-        CRITICAL_OP_BUCKETS.into(),
-    )
-    .expect("failed to define a metric");
-
-    ReconstructDataTimeMetrics {
-        singular: inner.with_label_values(&[GetKind::Singular.into()]),
-        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
-    }
-});
-
 pub(crate) struct GetVectoredLatency {
     map: EnumMap<TaskKind, Option<Histogram>>,
 }
@@ -3934,7 +3867,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
     });
 
     // Custom
-    Lazy::force(&RECONSTRUCT_TIME);
     Lazy::force(&BASEBACKUP_QUERY_TIME);
     Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
     Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index b14a44f9e3..e5063b7fc2 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -39,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::sync::gate::{Gate, GateGuard};
 use utils::sync::spsc_fold;
 use utils::{
     auth::{Claims, Scope, SwappableJwtAuth},
@@ -61,6 +62,7 @@ use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME};
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
 use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult};
+use crate::tenant::storage_layer::IoConcurrency;
 use crate::tenant::timeline::{self, WaitLsnError};
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
@@ -90,6 +92,7 @@ pub struct Listener {
 pub struct Connections {
     cancel: CancellationToken,
     tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
+    gate: Gate,
 }
 
 pub fn spawn(
@@ -110,6 +113,7 @@ pub fn spawn(
     let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
         "libpq listener",
         libpq_listener_main(
+            conf,
             tenant_manager,
             pg_auth,
             tcp_listener,
@@ -134,11 +138,16 @@ impl Listener {
 }
 impl Connections {
     pub(crate) async fn shutdown(self) {
-        let Self { cancel, mut tasks } = self;
+        let Self {
+            cancel,
+            mut tasks,
+            gate,
+        } = self;
         cancel.cancel();
         while let Some(res) = tasks.join_next().await {
             Self::handle_connection_completion(res);
         }
+        gate.close().await;
     }
 
     fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
@@ -158,7 +167,9 @@ impl Connections {
 /// Returns Ok(()) upon cancellation via `cancel`, returning the set of
 /// open connections.
 ///
+#[allow(clippy::too_many_arguments)]
 pub async fn libpq_listener_main(
+    conf: &'static PageServerConf,
     tenant_manager: Arc<TenantManager>,
     auth: Option<Arc<SwappableJwtAuth>>,
     listener: tokio::net::TcpListener,
@@ -168,9 +179,15 @@ pub async fn libpq_listener_main(
     listener_cancel: CancellationToken,
 ) -> Connections {
     let connections_cancel = CancellationToken::new();
+    let connections_gate = Gate::default();
     let mut connection_handler_tasks = tokio::task::JoinSet::default();
 
     loop {
+        let gate_guard = match connections_gate.enter() {
+            Ok(guard) => guard,
+            Err(_) => break,
+        };
+
         let accepted = tokio::select! {
             biased;
             _ = listener_cancel.cancelled() => break,
@@ -190,6 +207,7 @@ pub async fn libpq_listener_main(
                 let connection_ctx = listener_ctx
                     .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
                 connection_handler_tasks.spawn(page_service_conn_main(
+                    conf,
                     tenant_manager.clone(),
                     local_auth,
                     socket,
@@ -197,6 +215,7 @@ pub async fn libpq_listener_main(
                     pipelining_config.clone(),
                     connection_ctx,
                     connections_cancel.child_token(),
+                    gate_guard,
                 ));
             }
             Err(err) => {
@@ -211,13 +230,16 @@ pub async fn libpq_listener_main(
     Connections {
         cancel: connections_cancel,
         tasks: connection_handler_tasks,
+        gate: connections_gate,
     }
 }
 
 type ConnectionHandlerResult = anyhow::Result<()>;
 
 #[instrument(skip_all, fields(peer_addr))]
+#[allow(clippy::too_many_arguments)]
 async fn page_service_conn_main(
+    conf: &'static PageServerConf,
     tenant_manager: Arc<TenantManager>,
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
@@ -225,6 +247,7 @@ async fn page_service_conn_main(
     pipelining_config: PageServicePipeliningConfig,
     connection_ctx: RequestContext,
     cancel: CancellationToken,
+    gate_guard: GateGuard,
 ) -> ConnectionHandlerResult {
     let _guard = LIVE_CONNECTIONS
         .with_label_values(&["page_service"])
@@ -274,11 +297,13 @@ async fn page_service_conn_main(
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
     // and create the per-query context in process_query ourselves.
     let mut conn_handler = PageServerHandler::new(
+        conf,
         tenant_manager,
         auth,
         pipelining_config,
         connection_ctx,
         cancel.clone(),
+        gate_guard,
     );
     let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
 
@@ -310,6 +335,7 @@ async fn page_service_conn_main(
 }
 
 struct PageServerHandler {
+    conf: &'static PageServerConf,
     auth: Option<Arc<SwappableJwtAuth>>,
     claims: Option<Claims>,
 
@@ -325,6 +351,8 @@ struct PageServerHandler {
     timeline_handles: Option<TimelineHandles>,
 
     pipelining_config: PageServicePipeliningConfig,
+
+    gate_guard: GateGuard,
 }
 
 struct TimelineHandles {
@@ -634,19 +662,23 @@ impl BatchedFeMessage {
 
 impl PageServerHandler {
     pub fn new(
+        conf: &'static PageServerConf,
         tenant_manager: Arc<TenantManager>,
         auth: Option<Arc<SwappableJwtAuth>>,
         pipelining_config: PageServicePipeliningConfig,
         connection_ctx: RequestContext,
         cancel: CancellationToken,
+        gate_guard: GateGuard,
     ) -> Self {
         PageServerHandler {
+            conf,
             auth,
             claims: None,
             connection_ctx,
             timeline_handles: Some(TimelineHandles::new(tenant_manager)),
             cancel,
             pipelining_config,
+            gate_guard,
         }
     }
 
@@ -1015,6 +1047,7 @@ impl PageServerHandler {
         &mut self,
         pgb_writer: &mut PostgresBackend<IO>,
         batch: BatchedFeMessage,
+        io_concurrency: IoConcurrency,
         cancel: &CancellationToken,
         protocol_version: PagestreamProtocolVersion,
         ctx: &RequestContext,
@@ -1084,6 +1117,7 @@ impl PageServerHandler {
                                 &*shard.upgrade()?,
                                 effective_request_lsn,
                                 pages,
+                                io_concurrency,
                                 ctx,
                             )
                             .instrument(span.clone())
@@ -1288,6 +1322,17 @@ impl PageServerHandler {
             }
         }
 
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            self.conf,
+            match self.gate_guard.try_clone() {
+                Ok(guard) => guard,
+                Err(_) => {
+                    info!("shutdown request received in page handler");
+                    return Err(QueryError::Shutdown);
+                }
+            },
+        );
+
         let pgb_reader = pgb
             .split()
             .context("implementation error: split pgb into reader and writer")?;
@@ -1309,6 +1354,7 @@ impl PageServerHandler {
                     request_span,
                     pipelining_config,
                     protocol_version,
+                    io_concurrency,
                     &ctx,
                 )
                 .await
@@ -1322,6 +1368,7 @@ impl PageServerHandler {
                     timeline_handles,
                     request_span,
                     protocol_version,
+                    io_concurrency,
                     &ctx,
                 )
                 .await
@@ -1349,6 +1396,7 @@ impl PageServerHandler {
         mut timeline_handles: TimelineHandles,
         request_span: Span,
         protocol_version: PagestreamProtocolVersion,
+        io_concurrency: IoConcurrency,
         ctx: &RequestContext,
     ) -> (
         (PostgresBackendReader<IO>, TimelineHandles),
@@ -1383,7 +1431,14 @@ impl PageServerHandler {
             };
 
             let err = self
-                .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
+                .pagesteam_handle_batched_message(
+                    pgb_writer,
+                    msg,
+                    io_concurrency.clone(),
+                    &cancel,
+                    protocol_version,
+                    ctx,
+                )
                 .await;
             match err {
                 Ok(()) => {}
@@ -1407,6 +1462,7 @@ impl PageServerHandler {
         request_span: Span,
         pipelining_config: PageServicePipeliningConfigPipelined,
         protocol_version: PagestreamProtocolVersion,
+        io_concurrency: IoConcurrency,
         ctx: &RequestContext,
     ) -> (
         (PostgresBackendReader<IO>, TimelineHandles),
@@ -1550,6 +1606,7 @@ impl PageServerHandler {
                     self.pagesteam_handle_batched_message(
                         pgb_writer,
                         batch,
+                        io_concurrency.clone(),
                         &cancel,
                         protocol_version,
                         &ctx,
@@ -1806,6 +1863,7 @@ impl PageServerHandler {
         timeline: &Timeline,
         effective_lsn: Lsn,
         requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
+        io_concurrency: IoConcurrency,
         ctx: &RequestContext,
     ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
         debug_assert_current_span_has_tenant_and_timeline_id();
@@ -1832,6 +1890,7 @@ impl PageServerHandler {
             .get_rel_page_at_lsn_batched(
                 requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
                 effective_lsn,
+                io_concurrency,
                 ctx,
             )
             .await;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index b65fe6cf7c..40c657524d 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -17,6 +17,7 @@ use crate::span::{
     debug_assert_current_span_has_tenant_and_timeline_id,
     debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
 };
+use crate::tenant::storage_layer::IoConcurrency;
 use crate::tenant::timeline::GetVectoredError;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
@@ -200,6 +201,7 @@ impl Timeline {
         blknum: BlockNumber,
         version: Version<'_>,
         ctx: &RequestContext,
+        io_concurrency: IoConcurrency,
     ) -> Result<Bytes, PageReconstructError> {
         match version {
             Version::Lsn(effective_lsn) => {
@@ -208,6 +210,7 @@ impl Timeline {
                     .get_rel_page_at_lsn_batched(
                         pages.iter().map(|(tag, blknum)| (tag, blknum)),
                         effective_lsn,
+                        io_concurrency.clone(),
                         ctx,
                     )
                     .await;
@@ -246,6 +249,7 @@ impl Timeline {
         &self,
         pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber)>,
         effective_lsn: Lsn,
+        io_concurrency: IoConcurrency,
         ctx: &RequestContext,
     ) -> Vec<Result<Bytes, PageReconstructError>> {
         debug_assert_current_span_has_tenant_and_timeline_id();
@@ -309,7 +313,10 @@ impl Timeline {
             acc.to_keyspace()
         };
 
-        match self.get_vectored(keyspace, effective_lsn, ctx).await {
+        match self
+            .get_vectored(keyspace, effective_lsn, io_concurrency, ctx)
+            .await
+        {
             Ok(results) => {
                 for (key, res) in results {
                     let mut key_slots = keys_slots.remove(&key).unwrap().into_iter();
@@ -889,9 +896,15 @@ impl Timeline {
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
+        io_concurrency: IoConcurrency,
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
         let kv = self
-            .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
+            .scan(
+                KeySpace::single(Key::metadata_aux_key_range()),
+                lsn,
+                ctx,
+                io_concurrency,
+            )
             .await?;
         let mut result = HashMap::new();
         let mut sz = 0;
@@ -914,8 +927,9 @@ impl Timeline {
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
+        io_concurrency: IoConcurrency,
     ) -> Result<(), PageReconstructError> {
-        self.list_aux_files_v2(lsn, ctx).await?;
+        self.list_aux_files_v2(lsn, ctx, io_concurrency).await?;
         Ok(())
     }
 
@@ -923,17 +937,24 @@ impl Timeline {
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
+        io_concurrency: IoConcurrency,
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        self.list_aux_files_v2(lsn, ctx).await
+        self.list_aux_files_v2(lsn, ctx, io_concurrency).await
     }
 
     pub(crate) async fn get_replorigins(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
+        io_concurrency: IoConcurrency,
     ) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
         let kv = self
-            .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
+            .scan(
+                KeySpace::single(repl_origin_key_range()),
+                lsn,
+                ctx,
+                io_concurrency,
+            )
             .await?;
         let mut result = HashMap::new();
         for (k, v) in kv {
@@ -2432,7 +2453,11 @@ mod tests {
             ("foo/bar2".to_string(), Bytes::from_static(b"content2")),
         ]);
 
-        let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
+        let io_concurrency = IoConcurrency::spawn_for_test();
+
+        let readback = tline
+            .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone())
+            .await?;
         assert_eq!(readback, expect_1008);
 
         // Second modification: update one key, remove the other
@@ -2444,11 +2469,15 @@ mod tests {
         let expect_2008 =
             HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
 
-        let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?;
+        let readback = tline
+            .list_aux_files(Lsn(0x2008), &ctx, io_concurrency.clone())
+            .await?;
         assert_eq!(readback, expect_2008);
 
         // Reading back in time works
-        let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
+        let readback = tline
+            .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone())
+            .await?;
         assert_eq!(readback, expect_1008);
 
         Ok(())
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e45ba2ca3b..a273ef5d01 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5714,7 +5714,7 @@ mod tests {
     use pageserver_api::value::Value;
     use pageserver_compaction::helpers::overlaps_with;
     use rand::{thread_rng, Rng};
-    use storage_layer::PersistentLayerKey;
+    use storage_layer::{IoConcurrency, PersistentLayerKey};
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
     use timeline::{CompactOptions, DeltaLayerTestDesc};
@@ -6495,6 +6495,7 @@ mod tests {
     async fn test_get_vectored() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_get_vectored").await?;
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -6559,7 +6560,7 @@ mod tests {
                 .get_vectored_impl(
                     read.clone(),
                     reads_lsn,
-                    &mut ValuesReconstructState::new(),
+                    &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
                 .await;
@@ -6606,6 +6607,7 @@ mod tests {
         let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
 
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
         let tline = tenant
             .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -6640,7 +6642,7 @@ mod tests {
             .get_vectored_impl(
                 aux_keyspace.clone(),
                 read_lsn,
-                &mut ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(io_concurrency.clone()),
                 &ctx,
             )
             .await;
@@ -6688,6 +6690,7 @@ mod tests {
         )
         .await?;
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
 
         let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
         let gap_at_key = current_key.add(100);
@@ -6788,7 +6791,7 @@ mod tests {
             .get_vectored_impl(
                 read.clone(),
                 current_lsn,
-                &mut ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(io_concurrency.clone()),
                 &ctx,
             )
             .await?;
@@ -6831,6 +6834,7 @@ mod tests {
     async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
 
         let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
         let end_key = start_key.add(1000);
@@ -6923,7 +6927,7 @@ mod tests {
                         ranges: vec![child_gap_at_key..child_gap_at_key.next()],
                     },
                     query_lsn,
-                    &mut ValuesReconstructState::new(),
+                    &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
                 .await;
@@ -7369,6 +7373,7 @@ mod tests {
     async fn test_metadata_scan() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_metadata_scan").await?;
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -7422,7 +7427,7 @@ mod tests {
                 .get_vectored_impl(
                     keyspace.clone(),
                     lsn,
-                    &mut ValuesReconstructState::default(),
+                    &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
                 .await?
@@ -7537,6 +7542,7 @@ mod tests {
         let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap();
 
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
 
         let mut lsn = Lsn(0x08);
 
@@ -7556,7 +7562,10 @@ mod tests {
         }
 
         // we can read everything from the storage
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        let files = tline
+            .list_aux_files(lsn, &ctx, io_concurrency.clone())
+            .await
+            .unwrap();
         assert_eq!(
             files.get("pg_logical/mappings/test1"),
             Some(&bytes::Bytes::from_static(b"first"))
@@ -7572,7 +7581,10 @@ mod tests {
             modification.commit(&ctx).await.unwrap();
         }
 
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        let files = tline
+            .list_aux_files(lsn, &ctx, io_concurrency.clone())
+            .await
+            .unwrap();
         assert_eq!(
             files.get("pg_logical/mappings/test2"),
             Some(&bytes::Bytes::from_static(b"second"))
@@ -7583,7 +7595,10 @@ mod tests {
             .await
             .unwrap();
 
-        let files = child.list_aux_files(lsn, &ctx).await.unwrap();
+        let files = child
+            .list_aux_files(lsn, &ctx, io_concurrency.clone())
+            .await
+            .unwrap();
         assert_eq!(files.get("pg_logical/mappings/test1"), None);
         assert_eq!(files.get("pg_logical/mappings/test2"), None);
     }
@@ -7592,6 +7607,7 @@ mod tests {
     async fn test_metadata_image_creation() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_metadata_image_creation").await?;
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -7611,8 +7627,9 @@ mod tests {
             keyspace: &KeySpace,
             lsn: Lsn,
             ctx: &RequestContext,
+            io_concurrency: IoConcurrency,
         ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
-            let mut reconstruct_state = ValuesReconstructState::default();
+            let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
             let res = tline
                 .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
                 .await?;
@@ -7660,7 +7677,8 @@ mod tests {
 
             if iter % 5 == 0 {
                 let (_, before_delta_file_accessed) =
-                    scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
+                    scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone())
+                        .await?;
                 tline
                     .compact(
                         &cancel,
@@ -7674,7 +7692,8 @@ mod tests {
                     )
                     .await?;
                 let (_, after_delta_file_accessed) =
-                    scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
+                    scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone())
+                        .await?;
                 assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
                 // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances.
                 assert!(
@@ -7763,6 +7782,7 @@ mod tests {
     async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
 
         let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
         let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -7901,7 +7921,7 @@ mod tests {
         );
 
         // test vectored scan on parent timeline
-        let mut reconstruct_state = ValuesReconstructState::new();
+        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
         let res = tline
             .get_vectored_impl(
                 KeySpace::single(Key::metadata_key_range()),
@@ -7927,7 +7947,7 @@ mod tests {
         );
 
         // test vectored scan on child timeline
-        let mut reconstruct_state = ValuesReconstructState::new();
+        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
         let res = child
             .get_vectored_impl(
                 KeySpace::single(Key::metadata_key_range()),
@@ -7965,7 +7985,9 @@ mod tests {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<Option<Bytes>, GetVectoredError> {
-        let mut reconstruct_state = ValuesReconstructState::new();
+        let io_concurrency =
+            IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
+        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
         let mut res = tline
             .get_vectored_impl(
                 KeySpace::single(key..key.next()),
@@ -8066,6 +8088,7 @@ mod tests {
             .await
             .unwrap();
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
 
         let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
         let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -8125,7 +8148,7 @@ mod tests {
 
         // Image layers are created at last_record_lsn
         let images = tline
-            .inspect_image_layers(Lsn(0x40), &ctx)
+            .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
             .await
             .unwrap()
             .into_iter()
@@ -8140,6 +8163,7 @@ mod tests {
             .await
             .unwrap();
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
 
         let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
         let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
@@ -8190,7 +8214,7 @@ mod tests {
 
         // Image layers are created at last_record_lsn
         let images = tline
-            .inspect_image_layers(Lsn(0x30), &ctx)
+            .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone())
             .await
             .unwrap()
             .into_iter()
@@ -8203,6 +8227,7 @@ mod tests {
     async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
         let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
 
         fn get_key(id: u32) -> Key {
             // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
@@ -8344,7 +8369,7 @@ mod tests {
 
         // Check if the image layer at the GC horizon contains exactly what we want
         let image_at_gc_horizon = tline
-            .inspect_image_layers(Lsn(0x30), &ctx)
+            .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone())
             .await
             .unwrap()
             .into_iter()
@@ -10057,7 +10082,12 @@ mod tests {
 
         let keyspace = KeySpace::single(get_key(0)..get_key(10));
         let results = tline
-            .get_vectored(keyspace, delta_layer_end_lsn, &ctx)
+            .get_vectored(
+                keyspace,
+                delta_layer_end_lsn,
+                IoConcurrency::sequential(),
+                &ctx,
+            )
             .await
             .expect("No vectored errors");
         for (key, res) in results {
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 3913637ca0..c24d037dde 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -10,18 +10,26 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;
 
+use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
 use bytes::Bytes;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::value::Value;
-use std::cmp::{Ordering, Reverse};
+use std::cmp::Ordering;
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
+use std::future::Future;
 use std::ops::Range;
+use std::pin::Pin;
+use std::sync::atomic::AtomicUsize;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use tracing::{trace, Instrument};
+use utils::sync::gate::GateGuard;
 
 use utils::lsn::Lsn;
 
@@ -78,30 +86,151 @@ pub(crate) enum ValueReconstructSituation {
     Continue,
 }
 
-/// Reconstruct data accumulated for a single key during a vectored get
-#[derive(Debug, Default, Clone)]
-pub(crate) struct VectoredValueReconstructState {
-    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
-    pub(crate) img: Option<(Lsn, Bytes)>,
-
-    situation: ValueReconstructSituation,
+/// On disk representation of a value loaded in a buffer
+#[derive(Debug)]
+pub(crate) enum OnDiskValue {
+    /// Unencoded [`Value::Image`]
+    RawImage(Bytes),
+    /// Encoded [`Value`]. Can deserialize into an image or a WAL record
+    WalRecordOrImage(Bytes),
 }
 
-impl VectoredValueReconstructState {
-    fn get_cached_lsn(&self) -> Option<Lsn> {
-        self.img.as_ref().map(|img| img.0)
+/// Reconstruct data accumulated for a single key during a vectored get
+#[derive(Debug, Default)]
+pub(crate) struct VectoredValueReconstructState {
+    pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>,
+
+    pub(crate) situation: ValueReconstructSituation,
+}
+
+#[derive(Debug)]
+pub(crate) struct OnDiskValueIoWaiter {
+    rx: tokio::sync::oneshot::Receiver<OnDiskValueIoResult>,
+}
+
+#[derive(Debug)]
+#[must_use]
+pub(crate) enum OnDiskValueIo {
+    /// Traversal identified this IO as required to complete the vectored get.
+    Required {
+        num_active_ios: Arc<AtomicUsize>,
+        tx: tokio::sync::oneshot::Sender<OnDiskValueIoResult>,
+    },
+    /// Sparse keyspace reads always read all the values for a given key,
+    /// even though only the first value is needed.
+    ///
+    /// This variant represents the unnecessary IOs for those values at lower LSNs
+    /// that aren't needed, but are currently still being done.
+    ///
+    /// The execution of unnecessary IOs was a pre-existing behavior before concurrent IO.
+    /// We added this explicit representation here so that we can drop
+    /// unnecessary IO results immediately, instead of buffering them in
+    /// `oneshot` channels inside [`VectoredValueReconstructState`] until
+    /// [`VectoredValueReconstructState::collect_pending_ios`] gets called.
+    Unnecessary,
+}
+
+type OnDiskValueIoResult = Result<OnDiskValue, std::io::Error>;
+
+impl OnDiskValueIo {
+    pub(crate) fn complete(self, res: OnDiskValueIoResult) {
+        match self {
+            OnDiskValueIo::Required { num_active_ios, tx } => {
+                num_active_ios.fetch_sub(1, std::sync::atomic::Ordering::Release);
+                let _ = tx.send(res);
+            }
+            OnDiskValueIo::Unnecessary => {
+                // Nobody cared, see variant doc comment.
+            }
+        }
     }
 }
 
-impl From<VectoredValueReconstructState> for ValueReconstructState {
-    fn from(mut state: VectoredValueReconstructState) -> Self {
-        // walredo expects the records to be descending in terms of Lsn
-        state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum WaitCompletionError {
+    #[error("OnDiskValueIo was dropped without completing, likely the sidecar task panicked")]
+    IoDropped,
+}
 
-        ValueReconstructState {
-            records: state.records,
-            img: state.img,
+impl OnDiskValueIoWaiter {
+    pub(crate) async fn wait_completion(self) -> Result<OnDiskValueIoResult, WaitCompletionError> {
+        // NB: for Unnecessary IOs, this method never gets called because we don't add them to `on_disk_values`.
+        self.rx.await.map_err(|_| WaitCompletionError::IoDropped)
+    }
+}
+
+impl VectoredValueReconstructState {
+    /// # Cancel-Safety
+    ///
+    /// Technically fine to stop polling this future, but, the IOs will still
+    /// be executed to completion by the sidecar task and hold on to / consume resources.
+    /// Better not do it to make reasoning about the system easier.
+    pub(crate) async fn collect_pending_ios(
+        self,
+    ) -> Result<ValueReconstructState, PageReconstructError> {
+        use utils::bin_ser::BeSer;
+
+        let mut res = Ok(ValueReconstructState::default());
+
+        // We should try hard not to bail early, so that by the time we return from this
+        // function, all IO for this value is done. It's not required -- we could totally
+        // stop polling the IO futures in the sidecar task, they need to support that,
+        // but just stopping to poll doesn't reduce the IO load on the disk. It's easier
+        // to reason about the system if we just wait for all IO to complete, even if
+        // we're no longer interested in the result.
+        //
+        // Revisit this when IO futures are replaced with a more sophisticated IO system
+        // and an IO scheduler, where we know which IOs were submitted and which ones
+        // were just queued. Cf the comment on IoConcurrency::spawn_io.
+        for (lsn, waiter) in self.on_disk_values {
+            let value_recv_res = waiter
+                .wait_completion()
+                // we rely on the caller to poll us to completion, so this is not a bail point
+                .await;
+            // Force not bailing early by wrapping the code into a closure.
+            #[allow(clippy::redundant_closure_call)]
+            let _: () = (|| {
+                match (&mut res, value_recv_res) {
+                    (Err(_), _) => {
+                        // We've already failed, no need to process more.
+                    }
+                    (Ok(_), Err(wait_err)) => {
+                        // This shouldn't happen - likely the sidecar task panicked.
+                        res = Err(PageReconstructError::Other(wait_err.into()));
+                    }
+                    (Ok(_), Ok(Err(err))) => {
+                        let err: std::io::Error = err;
+                        // TODO: returning IO error here will fail a compute query.
+                        // Probably not what we want, we're not doing `maybe_fatal_err`
+                        // in the IO futures.
+                        // But it's been like that for a long time, not changing it
+                        // as part of concurrent IO.
+                        // => https://github.com/neondatabase/neon/issues/10454
+                        res = Err(PageReconstructError::Other(err.into()));
+                    }
+                    (Ok(ok), Ok(Ok(OnDiskValue::RawImage(img)))) => {
+                        assert!(ok.img.is_none());
+                        ok.img = Some((lsn, img));
+                    }
+                    (Ok(ok), Ok(Ok(OnDiskValue::WalRecordOrImage(buf)))) => {
+                        match Value::des(&buf) {
+                            Ok(Value::WalRecord(rec)) => {
+                                ok.records.push((lsn, rec));
+                            }
+                            Ok(Value::Image(img)) => {
+                                assert!(ok.img.is_none());
+                                ok.img = Some((lsn, img));
+                            }
+                            Err(err) => {
+                                res = Err(PageReconstructError::Other(err.into()));
+                            }
+                        }
+                    }
+                }
+            })();
         }
+
+        res
     }
 }
 
@@ -109,7 +238,7 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
 pub(crate) struct ValuesReconstructState {
     /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
     /// should not expect to get anything from this hashmap.
-    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
+    pub(crate) keys: HashMap<Key, VectoredValueReconstructState>,
     /// The keys which are already retrieved
     keys_done: KeySpaceRandomAccum,
 
@@ -119,27 +248,365 @@ pub(crate) struct ValuesReconstructState {
     // Statistics that are still accessible as a caller of `get_vectored_impl`.
     layers_visited: u32,
     delta_layers_visited: u32,
+
+    pub(crate) io_concurrency: IoConcurrency,
+    num_active_ios: Arc<AtomicUsize>,
+}
+
+/// The level of IO concurrency to be used on the read path
+///
+/// The desired end state is that we always do parallel IO.
+/// This struct and the dispatching in the impl will be removed once
+/// we've built enough confidence.
+pub(crate) enum IoConcurrency {
+    Sequential,
+    SidecarTask {
+        task_id: usize,
+        ios_tx: tokio::sync::mpsc::UnboundedSender<IoFuture>,
+    },
+}
+
+type IoFuture = Pin<Box<dyn Send + Future<Output = ()>>>;
+
+pub(crate) enum SelectedIoConcurrency {
+    Sequential,
+    SidecarTask(GateGuard),
+}
+
+impl std::fmt::Debug for IoConcurrency {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            IoConcurrency::Sequential => write!(f, "Sequential"),
+            IoConcurrency::SidecarTask { .. } => write!(f, "SidecarTask"),
+        }
+    }
+}
+
+impl std::fmt::Debug for SelectedIoConcurrency {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            SelectedIoConcurrency::Sequential => write!(f, "Sequential"),
+            SelectedIoConcurrency::SidecarTask(_) => write!(f, "SidecarTask"),
+        }
+    }
+}
+
+impl IoConcurrency {
+    /// Force sequential IO. This is a temporary workaround until we have
+    /// moved plumbing-through-the-call-stack
+    /// of IoConcurrency into `RequestContext`.
+    ///
+    /// DO NOT USE for new code.
+    ///
+    /// Tracking issue: <https://github.com/neondatabase/neon/issues/10460>.
+    pub(crate) fn sequential() -> Self {
+        Self::spawn(SelectedIoConcurrency::Sequential)
+    }
+
+    pub(crate) fn spawn_from_conf(
+        conf: &'static PageServerConf,
+        gate_guard: GateGuard,
+    ) -> IoConcurrency {
+        use pageserver_api::config::GetVectoredConcurrentIo;
+        let selected = match conf.get_vectored_concurrent_io {
+            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
+            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
+        };
+        Self::spawn(selected)
+    }
+
+    pub(crate) fn spawn(io_concurrency: SelectedIoConcurrency) -> Self {
+        match io_concurrency {
+            SelectedIoConcurrency::Sequential => IoConcurrency::Sequential,
+            SelectedIoConcurrency::SidecarTask(gate_guard) => {
+                let (ios_tx, ios_rx) = tokio::sync::mpsc::unbounded_channel();
+                static TASK_ID: AtomicUsize = AtomicUsize::new(0);
+                let task_id = TASK_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                // TODO: enrich the span with more context (tenant,shard,timeline) + (basebackup|pagestream|...)
+                let span =
+                    tracing::info_span!(parent: None, "IoConcurrency_sidecar", task_id = task_id);
+                trace!(task_id, "spawning sidecar task");
+                tokio::spawn(async move {
+                    trace!("start");
+                    scopeguard::defer!{ trace!("end") };
+                    type IosRx = tokio::sync::mpsc::UnboundedReceiver<IoFuture>;
+                    enum State {
+                        Waiting {
+                            // invariant: is_empty(), but we recycle the allocation
+                            empty_futures: FuturesUnordered<IoFuture>,
+                            ios_rx: IosRx,
+                        },
+                        Executing {
+                            futures: FuturesUnordered<IoFuture>,
+                            ios_rx: IosRx,
+                        },
+                        ShuttingDown {
+                            futures: FuturesUnordered<IoFuture>,
+                        },
+                    }
+                    let mut state = State::Waiting {
+                        empty_futures: FuturesUnordered::new(),
+                        ios_rx,
+                    };
+                    loop {
+                        match state {
+                            State::Waiting {
+                                empty_futures,
+                                mut ios_rx,
+                            } => {
+                                assert!(empty_futures.is_empty());
+                                tokio::select! {
+                                    fut = ios_rx.recv() => {
+                                        if let Some(fut) = fut {
+                                            trace!("received new io future");
+                                            empty_futures.push(fut);
+                                            state = State::Executing { futures: empty_futures, ios_rx };
+                                        } else {
+                                            state = State::ShuttingDown { futures: empty_futures }
+                                        }
+                                    }
+                                }
+                            }
+                            State::Executing {
+                                mut futures,
+                                mut ios_rx,
+                            } => {
+                                tokio::select! {
+                                    res = futures.next() => {
+                                        trace!("io future completed");
+                                        assert!(res.is_some());
+                                        if futures.is_empty() {
+                                            state = State::Waiting { empty_futures: futures, ios_rx};
+                                        } else {
+                                            state = State::Executing { futures, ios_rx };
+                                        }
+                                    }
+                                    fut = ios_rx.recv() => {
+                                        if let Some(fut) = fut {
+                                            trace!("received new io future");
+                                            futures.push(fut);
+                                            state =  State::Executing { futures, ios_rx};
+                                        } else {
+                                            state = State::ShuttingDown { futures };
+                                        }
+                                    }
+                                }
+                            }
+                            State::ShuttingDown {
+                                mut futures,
+                            } => {
+                                trace!("shutting down");
+                                while let Some(()) = futures.next().await {
+                                    trace!("io future completed (shutdown)");
+                                    // drain
+                                }
+                                trace!("shutdown complete");
+                                break;
+                            }
+                        }
+                    }
+                    drop(gate_guard); // drop it right before we exit
+                }.instrument(span));
+                IoConcurrency::SidecarTask { task_id, ios_tx }
+            }
+        }
+    }
+
+    pub(crate) fn clone(&self) -> Self {
+        match self {
+            IoConcurrency::Sequential => IoConcurrency::Sequential,
+            IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
+                task_id: *task_id,
+                ios_tx: ios_tx.clone(),
+            },
+        }
+    }
+
+    /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string.
+    ///
+    /// The IO is represented as an opaque future.
+    /// IO completion must be handled inside the future, e.g., through a oneshot channel.
+    ///
+    /// The API seems simple but there are multiple **pitfalls** involving
+    /// DEADLOCK RISK.
+    ///
+    /// First, there are no guarantees about the execution of the IO.
+    /// It may be `await`ed in-place before this function returns.
+    /// It may be polled partially by this task and handed off to another task to be finished.
+    /// It may be polled and then dropped before returning ready.
+    ///
+    /// This means that submitted IOs must not be interdependent.
+    /// Interdependence may be through shared limited resources, e.g.,
+    /// - VirtualFile file descriptor cache slot acquisition
+    /// - tokio-epoll-uring slot
+    ///
+    /// # Why current usage is safe from deadlocks
+    ///
+    /// Textbook condition for a deadlock is that _all_ of the following be given
+    /// - Mutual exclusion
+    /// - Hold and wait
+    /// - No preemption
+    /// - Circular wait
+    ///
+    /// The current usage is safe because:
+    /// - Mutual exclusion: IO futures definitely use mutexes, no way around that for now
+    /// - Hold and wait: IO futures currently hold two kinds of locks/resources while waiting
+    ///   for acquisition of other resources:
+    ///    - VirtualFile file descriptor cache slot tokio mutex
+    ///    - tokio-epoll-uring slot (uses tokio notify => wait queue, much like mutex)
+    /// - No preemption: there's no taking-away of acquired locks/resources => given
+    /// - Circular wait: this is the part of the condition that isn't met: all IO futures
+    ///   first acquire VirtualFile mutex, then tokio-epoll-uring slot.
+    ///   There is no IO future that acquires slot before VirtualFile.
+    ///   Hence there can be no circular waiting.
+    ///   Hence there cannot be a deadlock.
+    ///
+    /// This is a very fragile situation and must be revisited whenever any code called from
+    /// inside the IO futures is changed.
+    ///
+    /// We will move away from opaque IO futures towards well-defined IOs at some point in
+    /// the future when we have shipped this first version of concurrent IO to production
+    /// and are ready to retire the Sequential mode which runs the futures in place.
+    /// Right now, while brittle, the opaque IO approach allows us to ship the feature
+    /// with minimal changes to the code and minimal changes to existing behavior in Sequential mode.
+    ///
+    /// Also read the comment in `collect_pending_ios`.
+    pub(crate) async fn spawn_io<F>(&mut self, fut: F)
+    where
+        F: std::future::Future<Output = ()> + Send + 'static,
+    {
+        match self {
+            IoConcurrency::Sequential => fut.await,
+            IoConcurrency::SidecarTask { ios_tx, .. } => {
+                let fut = Box::pin(fut);
+                // NB: experiments showed that doing an opportunistic poll of `fut` here was bad for throughput
+                // while insignificant for latency.
+                // It would make sense to revisit the tokio-epoll-uring API in the future such that we can try
+                // a submission here, but never poll the future. That way, io_uring can make progress while
+                // the future sits in the ios_tx queue.
+                match ios_tx.send(fut) {
+                    Ok(()) => {}
+                    Err(_) => {
+                        unreachable!("the io task must have exited, likely it panicked")
+                    }
+                }
+            }
+        }
+    }
+
+    #[cfg(test)]
+    pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut<Target = Self> {
+        use std::ops::{Deref, DerefMut};
+        use tracing::info;
+        use utils::sync::gate::Gate;
+
+        // Spawn needs a Gate, give it one.
+        struct Wrapper {
+            inner: IoConcurrency,
+            #[allow(dead_code)]
+            gate: Box<Gate>,
+        }
+        impl Deref for Wrapper {
+            type Target = IoConcurrency;
+
+            fn deref(&self) -> &Self::Target {
+                &self.inner
+            }
+        }
+        impl DerefMut for Wrapper {
+            fn deref_mut(&mut self) -> &mut Self::Target {
+                &mut self.inner
+            }
+        }
+        let gate = Box::new(Gate::default());
+
+        // The default behavior when running Rust unit tests without any further
+        // flags is to use the new behavior.
+        // The CI uses the following environment variable to unit test both old
+        // and new behavior.
+        // NB: the Python regression & perf tests take the `else` branch
+        // below and have their own defaults management.
+        let selected = {
+            // The pageserver_api::config type is unsuitable because it's internally tagged.
+            #[derive(serde::Deserialize)]
+            #[serde(rename_all = "kebab-case")]
+            enum TestOverride {
+                Sequential,
+                SidecarTask,
+            }
+            use once_cell::sync::Lazy;
+            static TEST_OVERRIDE: Lazy<TestOverride> = Lazy::new(|| {
+                utils::env::var_serde_json_string(
+                    "NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO",
+                )
+                .unwrap_or(TestOverride::SidecarTask)
+            });
+
+            match *TEST_OVERRIDE {
+                TestOverride::Sequential => SelectedIoConcurrency::Sequential,
+                TestOverride::SidecarTask => {
+                    SelectedIoConcurrency::SidecarTask(gate.enter().expect("just created it"))
+                }
+            }
+        };
+
+        info!(?selected, "get_vectored_concurrent_io test");
+
+        Wrapper {
+            inner: Self::spawn(selected),
+            gate,
+        }
+    }
+}
+
+/// Make noise in case the [`ValuesReconstructState`] gets dropped while
+/// there are still IOs in flight.
+/// Refer to `collect_pending_ios` for why we prefer not to do that.
+///
+/// We log from here instead of from the sidecar task because the [`ValuesReconstructState`]
+/// gets dropped in a tracing span with more context.
+/// We repeat the sidecar task's `task_id` so we can correlate what we emit here with
+/// the logs / panic handler logs from the sidecar task, which also logs the `task_id`.
+impl Drop for ValuesReconstructState {
+    fn drop(&mut self) {
+        let num_active_ios = self
+            .num_active_ios
+            .load(std::sync::atomic::Ordering::Acquire);
+        if num_active_ios == 0 {
+            return;
+        }
+        let sidecar_task_id = match &self.io_concurrency {
+            IoConcurrency::Sequential => None,
+            IoConcurrency::SidecarTask { task_id, .. } => Some(*task_id),
+        };
+        tracing::warn!(
+            num_active_ios,
+            ?sidecar_task_id,
+            backtrace=%std::backtrace::Backtrace::force_capture(),
+            "dropping ValuesReconstructState while some IOs have not been completed",
+        );
+    }
 }
 
 impl ValuesReconstructState {
-    pub(crate) fn new() -> Self {
+    pub(crate) fn new(io_concurrency: IoConcurrency) -> Self {
         Self {
             keys: HashMap::new(),
             keys_done: KeySpaceRandomAccum::new(),
             keys_with_image_coverage: None,
             layers_visited: 0,
             delta_layers_visited: 0,
+            io_concurrency,
+            num_active_ios: Arc::new(AtomicUsize::new(0)),
         }
     }
 
-    /// Associate a key with the error which it encountered and mark it as done
-    pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
-        let previous = self.keys.insert(key, Err(err));
-        if let Some(Ok(state)) = previous {
-            if state.situation == ValueReconstructSituation::Continue {
-                self.keys_done.add_key(key);
-            }
-        }
+    /// Absolutely read [`IoConcurrency::spawn_io`] to learn about assumptions & pitfalls.
+    pub(crate) async fn spawn_io<F>(&mut self, fut: F)
+    where
+        F: std::future::Future<Output = ()> + Send + 'static,
+    {
+        self.io_concurrency.spawn_io(fut).await;
     }
 
     pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
@@ -159,29 +626,6 @@ impl ValuesReconstructState {
         self.layers_visited
     }
 
-    /// This function is called after reading a keyspace from a layer.
-    /// It checks if the read path has now moved past the cached Lsn for any keys.
-    ///
-    /// Implementation note: We intentionally iterate over the keys for which we've
-    /// already collected some reconstruct data. This avoids scaling complexity with
-    /// the size of the search space.
-    pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) {
-        for (key, value) in self.keys.iter_mut() {
-            if !keyspace.contains(key) {
-                continue;
-            }
-
-            if let Ok(state) = value {
-                if state.situation != ValueReconstructSituation::Complete
-                    && state.get_cached_lsn() >= Some(advanced_to)
-                {
-                    state.situation = ValueReconstructSituation::Complete;
-                    self.keys_done.add_key(*key);
-                }
-            }
-        }
-    }
-
     /// On hitting image layer, we can mark all keys in this range as done, because
     /// if the image layer does not contain a key, it is deleted/never added.
     pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
@@ -199,70 +643,42 @@ impl ValuesReconstructState {
     ///
     /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in
     /// `key_done`.
-    pub(crate) fn update_key(
-        &mut self,
-        key: &Key,
-        lsn: Lsn,
-        value: Value,
-    ) -> ValueReconstructSituation {
-        let state = self
-            .keys
-            .entry(*key)
-            .or_insert(Ok(VectoredValueReconstructState::default()));
+    // TODO: rename this method & update description.
+    pub(crate) fn update_key(&mut self, key: &Key, lsn: Lsn, completes: bool) -> OnDiskValueIo {
+        let state = self.keys.entry(*key).or_default();
+
         let is_sparse_key = key.is_sparse();
-        if let Ok(state) = state {
-            let key_done = match state.situation {
-                ValueReconstructSituation::Complete => {
-                    if is_sparse_key {
-                        // Sparse keyspace might be visited multiple times because
-                        // we don't track unmapped keyspaces.
-                        return ValueReconstructSituation::Complete;
-                    } else {
-                        unreachable!()
-                    }
-                }
-                ValueReconstructSituation::Continue => match value {
-                    Value::Image(img) => {
-                        state.img = Some((lsn, img));
-                        true
-                    }
-                    Value::WalRecord(rec) => {
-                        debug_assert!(
-                            Some(lsn) > state.get_cached_lsn(),
-                            "Attempt to collect a record below cached LSN for walredo: {} < {}",
-                            lsn,
-                            state
-                                .get_cached_lsn()
-                                .expect("Assertion can only fire if a cached lsn is present")
-                        );
 
-                        let will_init = rec.will_init();
-                        state.records.push((lsn, rec));
-                        will_init
-                    }
-                },
-            };
-
-            if key_done && state.situation == ValueReconstructSituation::Continue {
-                state.situation = ValueReconstructSituation::Complete;
-                if !is_sparse_key {
-                    self.keys_done.add_key(*key);
+        let required_io = match state.situation {
+            ValueReconstructSituation::Complete => {
+                if is_sparse_key {
+                    // Sparse keyspace might be visited multiple times because
+                    // we don't track unmapped keyspaces.
+                    return OnDiskValueIo::Unnecessary;
+                } else {
+                    unreachable!()
                 }
             }
+            ValueReconstructSituation::Continue => {
+                self.num_active_ios
+                    .fetch_add(1, std::sync::atomic::Ordering::Release);
+                let (tx, rx) = tokio::sync::oneshot::channel();
+                state.on_disk_values.push((lsn, OnDiskValueIoWaiter { rx }));
+                OnDiskValueIo::Required {
+                    tx,
+                    num_active_ios: Arc::clone(&self.num_active_ios),
+                }
+            }
+        };
 
-            state.situation
-        } else {
-            ValueReconstructSituation::Complete
+        if completes && state.situation == ValueReconstructSituation::Continue {
+            state.situation = ValueReconstructSituation::Complete;
+            if !is_sparse_key {
+                self.keys_done.add_key(*key);
+            }
         }
-    }
 
-    /// Returns the Lsn at which this key is cached if one exists.
-    /// The read path should go no further than this Lsn for the given key.
-    pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
-        self.keys
-            .get(key)
-            .and_then(|k| k.as_ref().ok())
-            .and_then(|state| state.get_cached_lsn())
+        required_io
     }
 
     /// Returns the key space describing the keys that have
@@ -276,12 +692,6 @@ impl ValuesReconstructState {
     }
 }
 
-impl Default for ValuesReconstructState {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 /// A key that uniquely identifies a layer in a timeline
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 pub(crate) enum LayerId {
@@ -720,3 +1130,78 @@ impl<T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'_, T> {
         write!(f, "{}..{}", self.0.start, self.0.end)
     }
 }
+
+#[cfg(test)]
+mod tests2 {
+    use pageserver_api::key::DBDIR_KEY;
+    use tracing::info;
+
+    use super::*;
+    use crate::tenant::storage_layer::IoConcurrency;
+
+    /// TODO: currently this test relies on manual visual inspection of the --no-capture output.
+    /// Should look like so:
+    /// ```text
+    /// RUST_LOG=trace cargo nextest run  --features testing  --no-capture test_io_concurrency_noise
+    /// running 1 test
+    /// 2025-01-21T17:42:01.335679Z  INFO get_vectored_concurrent_io test selected=SidecarTask
+    /// 2025-01-21T17:42:01.335680Z TRACE spawning sidecar task task_id=0
+    /// 2025-01-21T17:42:01.335937Z TRACE IoConcurrency_sidecar{task_id=0}: start
+    /// 2025-01-21T17:42:01.335972Z TRACE IoConcurrency_sidecar{task_id=0}: received new io future
+    /// 2025-01-21T17:42:01.335999Z  INFO IoConcurrency_sidecar{task_id=0}: waiting for signal to complete IO
+    /// 2025-01-21T17:42:01.336229Z  WARN dropping ValuesReconstructState while some IOs have not been completed num_active_ios=1 sidecar_task_id=Some(0) backtrace=   0: <pageserver::tenant::storage_layer::ValuesReconstructState as core::ops::drop::Drop>::drop
+    ///              at ./src/tenant/storage_layer.rs:553:24
+    ///    1: core::ptr::drop_in_place<pageserver::tenant::storage_layer::ValuesReconstructState>
+    ///              at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:521:1
+    ///    2: core::mem::drop
+    ///              at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/mem/mod.rs:942:24
+    ///    3: pageserver::tenant::storage_layer::tests2::test_io_concurrency_noise::{{closure}}
+    ///              at ./src/tenant/storage_layer.rs:1159:9
+    ///   ...
+    ///   49: <unknown>
+    /// 2025-01-21T17:42:01.452293Z  INFO IoConcurrency_sidecar{task_id=0}: completing IO
+    /// 2025-01-21T17:42:01.452357Z TRACE IoConcurrency_sidecar{task_id=0}: io future completed
+    /// 2025-01-21T17:42:01.452473Z TRACE IoConcurrency_sidecar{task_id=0}: end
+    /// test tenant::storage_layer::tests2::test_io_concurrency_noise ... ok
+    ///
+    /// ```
+    #[tokio::test]
+    async fn test_io_concurrency_noise() {
+        crate::tenant::harness::setup_logging();
+
+        let io_concurrency = IoConcurrency::spawn_for_test();
+        match *io_concurrency {
+            IoConcurrency::Sequential => {
+                // This test asserts behavior in sidecar mode, doesn't make sense in sequential mode.
+                return;
+            }
+            IoConcurrency::SidecarTask { .. } => {}
+        }
+        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
+
+        let (io_fut_is_waiting_tx, io_fut_is_waiting) = tokio::sync::oneshot::channel();
+        let (do_complete_io, should_complete_io) = tokio::sync::oneshot::channel();
+        let (io_fut_exiting_tx, io_fut_exiting) = tokio::sync::oneshot::channel();
+
+        let io = reconstruct_state.update_key(&DBDIR_KEY, Lsn(8), true);
+        reconstruct_state
+            .spawn_io(async move {
+                info!("waiting for signal to complete IO");
+                io_fut_is_waiting_tx.send(()).unwrap();
+                should_complete_io.await.unwrap();
+                info!("completing IO");
+                io.complete(Ok(OnDiskValue::RawImage(Bytes::new())));
+                io_fut_exiting_tx.send(()).unwrap();
+            })
+            .await;
+
+        io_fut_is_waiting.await.unwrap();
+
+        // this is what makes the noise
+        drop(reconstruct_state);
+
+        do_complete_io.send(()).unwrap();
+
+        io_fut_exiting.await.unwrap();
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index ade1b794c6..885c50425f 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -41,13 +41,12 @@ use crate::tenant::vectored_blob_io::{
     BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
     VectoredReadPlanner,
 };
-use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
@@ -60,7 +59,7 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_api::value::Value;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::VecDeque;
+use std::collections::{HashMap, VecDeque};
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -77,7 +76,10 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer,
+    ValuesReconstructState,
+};
 
 ///
 /// Header stored in the beginning of the file
@@ -226,7 +228,7 @@ pub struct DeltaLayerInner {
     index_start_blk: u32,
     index_root_blk: u32,
 
-    file: VirtualFile,
+    file: Arc<VirtualFile>,
     file_id: FileId,
 
     layer_key_range: Range<Key>,
@@ -795,9 +797,11 @@ impl DeltaLayerInner {
         max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open_v2(path, ctx)
-            .await
-            .context("open layer file")?;
+        let file = Arc::new(
+            VirtualFile::open_v2(path, ctx)
+                .await
+                .context("open layer file")?,
+        );
 
         let file_id = page_cache::next_file_id();
 
@@ -842,12 +846,11 @@ impl DeltaLayerInner {
     // Look up the keys in the provided keyspace and update
     // the reconstruct state with whatever is found.
     //
-    // If the key is cached, go no further than the cached Lsn.
-    //
     // Currently, the index is visited for each range, but this
     // can be further optimised to visit the index only once.
     pub(super) async fn get_values_reconstruct_data(
         &self,
+        this: ResidentLayer,
         keyspace: KeySpace,
         lsn_range: Range<Lsn>,
         reconstruct_state: &mut ValuesReconstructState,
@@ -875,17 +878,14 @@ impl DeltaLayerInner {
             data_end_offset,
             index_reader,
             planner,
-            reconstruct_state,
             ctx,
         )
         .await
         .map_err(GetVectoredError::Other)?;
 
-        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
+        self.do_reads_and_update_state(this, reads, reconstruct_state, ctx)
             .await;
 
-        reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start);
-
         Ok(())
     }
 
@@ -895,7 +895,6 @@ impl DeltaLayerInner {
         data_end_offset: u64,
         index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
         mut planner: VectoredReadPlanner,
-        reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> anyhow::Result<Vec<VectoredRead>>
     where
@@ -922,10 +921,9 @@ impl DeltaLayerInner {
                 assert!(key >= range.start);
 
                 let outside_lsn_range = !lsn_range.contains(&lsn);
-                let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn);
 
                 let flag = {
-                    if outside_lsn_range || below_cached_lsn {
+                    if outside_lsn_range {
                         BlobFlag::Ignore
                     } else if blob_ref.will_init() {
                         BlobFlag::ReplaceAll
@@ -994,98 +992,78 @@ impl DeltaLayerInner {
 
     async fn do_reads_and_update_state(
         &self,
+        this: ResidentLayer,
         reads: Vec<VectoredRead>,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) {
-        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
-        let mut ignore_key_with_err = None;
-
         let max_vectored_read_bytes = self
             .max_vectored_read_bytes
             .expect("Layer is loaded with max vectored bytes config")
             .0
             .into();
         let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
-        let mut buf = Some(IoBufferMut::with_capacity(buf_size));
 
         // Note that reads are processed in reverse order (from highest key+lsn).
         // This is the order that `ReconstructState` requires such that it can
         // track when a key is done.
         for read in reads.into_iter().rev() {
-            let res = vectored_blob_reader
-                .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
-                .await;
-
-            let blobs_buf = match res {
-                Ok(blobs_buf) => blobs_buf,
-                Err(err) => {
-                    let kind = err.kind();
-                    for (_, blob_meta) in read.blobs_at.as_slice() {
-                        reconstruct_state.on_key_error(
-                            blob_meta.key,
-                            PageReconstructError::Other(anyhow!(
-                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path(),
-                                kind
-                            )),
-                        );
-                    }
-
-                    // We have "lost" the buffer since the lower level IO api
-                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(IoBufferMut::with_capacity(buf_size));
-
-                    continue;
-                }
-            };
-            let view = BufView::new_slice(&blobs_buf.buf);
-            for meta in blobs_buf.blobs.iter().rev() {
-                if Some(meta.meta.key) == ignore_key_with_err {
-                    continue;
-                }
-                let blob_read = meta.read(&view).await;
-                let blob_read = match blob_read {
-                    Ok(buf) => buf,
-                    Err(e) => {
-                        reconstruct_state.on_key_error(
-                            meta.meta.key,
-                            PageReconstructError::Other(anyhow!(e).context(format!(
-                                "Failed to decompress blob from virtual file {}",
-                                self.file.path(),
-                            ))),
-                        );
-
-                        ignore_key_with_err = Some(meta.meta.key);
-                        continue;
-                    }
-                };
-
-                let value = Value::des(&blob_read);
-
-                let value = match value {
-                    Ok(v) => v,
-                    Err(e) => {
-                        reconstruct_state.on_key_error(
-                            meta.meta.key,
-                            PageReconstructError::Other(anyhow!(e).context(format!(
-                                "Failed to deserialize blob from virtual file {}",
-                                self.file.path(),
-                            ))),
-                        );
-
-                        ignore_key_with_err = Some(meta.meta.key);
-                        continue;
-                    }
-                };
-
-                // Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
-                // state, no further updates shall be made to it. The call below will
-                // panic if the invariant is violated.
-                reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
+            let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
+            for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() {
+                let io = reconstruct_state.update_key(
+                    &blob_meta.key,
+                    blob_meta.lsn,
+                    blob_meta.will_init,
+                );
+                ios.insert((blob_meta.key, blob_meta.lsn), io);
             }
 
-            buf = Some(blobs_buf.buf);
+            let read_extend_residency = this.clone();
+            let read_from = self.file.clone();
+            let read_ctx = ctx.attached_child();
+            reconstruct_state
+                .spawn_io(async move {
+                    let vectored_blob_reader = VectoredBlobReader::new(&read_from);
+                    let buf = IoBufferMut::with_capacity(buf_size);
+
+                    let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
+                    match res {
+                        Ok(blobs_buf) => {
+                            let view = BufView::new_slice(&blobs_buf.buf);
+                            for meta in blobs_buf.blobs.iter().rev() {
+                                let io = ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap();
+
+                                let blob_read = meta.read(&view).await;
+                                let blob_read = match blob_read {
+                                    Ok(buf) => buf,
+                                    Err(e) => {
+                                        io.complete(Err(e));
+                                        continue;
+                                    }
+                                };
+
+                                io.complete(Ok(OnDiskValue::WalRecordOrImage(
+                                    blob_read.into_bytes(),
+                                )));
+                            }
+
+                            assert!(ios.is_empty());
+                        }
+                        Err(err) => {
+                            for (_, sender) in ios {
+                                sender.complete(Err(std::io::Error::new(
+                                    err.kind(),
+                                    "vec read failed",
+                                )));
+                            }
+                        }
+                    }
+
+                    // keep layer resident until this IO is done; this spawned IO future generally outlives the
+                    // call to `self` / the `Arc<DownloadedLayer>` / the `ResidentLayer` that guarantees residency
+                    drop(read_extend_residency);
+                })
+                .await;
         }
     }
 
@@ -1224,7 +1202,14 @@ impl DeltaLayerInner {
             let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
                 let end_offset = offset;
 
-                Some((BlobMeta { key, lsn }, start_offset..end_offset))
+                Some((
+                    BlobMeta {
+                        key,
+                        lsn,
+                        will_init: false,
+                    },
+                    start_offset..end_offset,
+                ))
             } else {
                 None
             };
@@ -1560,7 +1545,9 @@ impl DeltaLayerIterator<'_> {
                 let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
                 let blob_ref = BlobRef(value);
                 let offset = blob_ref.pos();
-                if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
+                if let Some(batch_plan) =
+                    self.planner.handle(key, lsn, offset, blob_ref.will_init())
+                {
                     break batch_plan;
                 }
             } else {
@@ -1673,7 +1660,6 @@ pub(crate) mod test {
             .expect("In memory disk finish should never fail");
         let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk);
         let planner = VectoredReadPlanner::new(100);
-        let mut reconstruct_state = ValuesReconstructState::new();
         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
 
         let keyspace = KeySpace {
@@ -1691,7 +1677,6 @@ pub(crate) mod test {
             disk_offset,
             reader,
             planner,
-            &mut reconstruct_state,
             &ctx,
         )
         .await
@@ -1935,7 +1920,6 @@ pub(crate) mod test {
             );
 
             let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
-            let mut reconstruct_state = ValuesReconstructState::new();
             let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
             let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
 
@@ -1945,7 +1929,6 @@ pub(crate) mod test {
                 data_end_offset,
                 index_reader,
                 planner,
-                &mut reconstruct_state,
                 &ctx,
             )
             .await?;
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 0d3c9d5a44..c49281dc45 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,12 +38,11 @@ use crate::tenant::vectored_blob_io::{
     BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
     VectoredReadPlanner,
 };
-use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
@@ -56,12 +55,13 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_api::value::Value;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::VecDeque;
+use std::collections::{HashMap, VecDeque};
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
+use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -73,7 +73,10 @@ use utils::{
 };
 
 use super::layer_name::ImageLayerName;
-use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer,
+    ValuesReconstructState,
+};
 
 ///
 /// Header stored in the beginning of the file
@@ -164,7 +167,7 @@ pub struct ImageLayerInner {
     key_range: Range<Key>,
     lsn: Lsn,
 
-    file: VirtualFile,
+    file: Arc<VirtualFile>,
     file_id: FileId,
 
     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
@@ -391,9 +394,11 @@ impl ImageLayerInner {
         max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open_v2(path, ctx)
-            .await
-            .context("open layer file")?;
+        let file = Arc::new(
+            VirtualFile::open_v2(path, ctx)
+                .await
+                .context("open layer file")?,
+        );
         let file_id = page_cache::next_file_id();
         let block_reader = FileBlockReader::new(&file, file_id);
         let summary_blk = block_reader
@@ -439,6 +444,7 @@ impl ImageLayerInner {
     // the reconstruct state with whatever is found.
     pub(super) async fn get_values_reconstruct_data(
         &self,
+        this: ResidentLayer,
         keyspace: KeySpace,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
@@ -448,7 +454,7 @@ impl ImageLayerInner {
             .await
             .map_err(GetVectoredError::Other)?;
 
-        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
+        self.do_reads_and_update_state(this, reads, reconstruct_state, ctx)
             .await;
 
         reconstruct_state.on_image_layer_visited(&self.key_range);
@@ -570,6 +576,7 @@ impl ImageLayerInner {
 
     async fn do_reads_and_update_state(
         &self,
+        this: ResidentLayer,
         reads: Vec<VectoredRead>,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
@@ -580,8 +587,13 @@ impl ImageLayerInner {
             .0
             .into();
 
-        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
         for read in reads.into_iter() {
+            let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
+            for (_, blob_meta) in read.blobs_at.as_slice() {
+                let io = reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true);
+                ios.insert((blob_meta.key, blob_meta.lsn), io);
+            }
+
             let buf_size = read.size();
 
             if buf_size > max_vectored_read_bytes {
@@ -611,50 +623,51 @@ impl ImageLayerInner {
                 }
             }
 
-            let buf = IoBufferMut::with_capacity(buf_size);
-            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
+            let read_extend_residency = this.clone();
+            let read_from = self.file.clone();
+            let read_ctx = ctx.attached_child();
+            reconstruct_state
+                .spawn_io(async move {
+                    let buf = IoBufferMut::with_capacity(buf_size);
+                    let vectored_blob_reader = VectoredBlobReader::new(&read_from);
+                    let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
 
-            match res {
-                Ok(blobs_buf) => {
-                    let view = BufView::new_slice(&blobs_buf.buf);
-                    for meta in blobs_buf.blobs.iter() {
-                        let img_buf = meta.read(&view).await;
+                    match res {
+                        Ok(blobs_buf) => {
+                            let view = BufView::new_slice(&blobs_buf.buf);
+                            for meta in blobs_buf.blobs.iter() {
+                                let io: OnDiskValueIo =
+                                    ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap();
+                                let img_buf = meta.read(&view).await;
 
-                        let img_buf = match img_buf {
-                            Ok(img_buf) => img_buf,
-                            Err(e) => {
-                                reconstruct_state.on_key_error(
-                                    meta.meta.key,
-                                    PageReconstructError::Other(anyhow!(e).context(format!(
-                                        "Failed to decompress blob from virtual file {}",
-                                        self.file.path(),
-                                    ))),
-                                );
+                                let img_buf = match img_buf {
+                                    Ok(img_buf) => img_buf,
+                                    Err(e) => {
+                                        io.complete(Err(e));
+                                        continue;
+                                    }
+                                };
 
-                                continue;
+                                io.complete(Ok(OnDiskValue::RawImage(img_buf.into_bytes())));
                             }
-                        };
-                        reconstruct_state.update_key(
-                            &meta.meta.key,
-                            self.lsn,
-                            Value::Image(img_buf.into_bytes()),
-                        );
+
+                            assert!(ios.is_empty());
+                        }
+                        Err(err) => {
+                            for (_, io) in ios {
+                                io.complete(Err(std::io::Error::new(
+                                    err.kind(),
+                                    "vec read failed",
+                                )));
+                            }
+                        }
                     }
-                }
-                Err(err) => {
-                    let kind = err.kind();
-                    for (_, blob_meta) in read.blobs_at.as_slice() {
-                        reconstruct_state.on_key_error(
-                            blob_meta.key,
-                            PageReconstructError::from(anyhow!(
-                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path(),
-                                kind
-                            )),
-                        );
-                    }
-                }
-            };
+
+                    // keep layer resident until this IO is done; this spawned IO future generally outlives the
+                    // call to `self` / the `Arc<DownloadedLayer>` / the `ResidentLayer` that guarantees residency
+                    drop(read_extend_residency);
+                })
+                .await;
         }
     }
 
@@ -1069,6 +1082,7 @@ impl ImageLayerIterator<'_> {
                     Key::from_slice(&raw_key[..KEY_SIZE]),
                     self.image_layer.lsn,
                     offset,
+                    true,
                 ) {
                     break batch_plan;
                 }
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 2b67f55a17..61a0fdea8c 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -8,23 +8,22 @@ use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::tenant::ephemeral_file::EphemeralFile;
+use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
 use crate::tenant::timeline::GetVectoredError;
-use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache};
-use anyhow::{anyhow, Result};
+use anyhow::Result;
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use pageserver_api::value::Value;
 use std::collections::{BTreeMap, HashMap};
 use std::sync::{Arc, OnceLock};
 use std::time::Instant;
 use tracing::*;
-use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
+use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap};
 use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
@@ -36,9 +35,7 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::RwLock;
 
-use super::{
-    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
-};
+use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
 
 pub(crate) mod vectored_dio_read;
 
@@ -415,10 +412,8 @@ impl InMemoryLayer {
 
     // Look up the keys in the provided keyspace and update
     // the reconstruct state with whatever is found.
-    //
-    // If the key is cached, go no further than the cached Lsn.
     pub(crate) async fn get_values_reconstruct_data(
-        &self,
+        self: &Arc<InMemoryLayer>,
         keyspace: KeySpace,
         end_lsn: Lsn,
         reconstruct_state: &mut ValuesReconstructState,
@@ -435,6 +430,9 @@ impl InMemoryLayer {
             read: vectored_dio_read::LogicalRead<Vec<u8>>,
         }
         let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
+        let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
+
+        let lsn_range = self.start_lsn..end_lsn;
 
         for range in keyspace.ranges.iter() {
             for (key, vec_map) in inner
@@ -442,12 +440,7 @@ impl InMemoryLayer {
                 .range(range.start.to_compact()..range.end.to_compact())
             {
                 let key = Key::from_compact(*key);
-                let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
-                    Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
-                    None => self.start_lsn..end_lsn,
-                };
-
-                let slice = vec_map.slice_range(lsn_range);
+                let slice = vec_map.slice_range(lsn_range.clone());
 
                 for (entry_lsn, index_entry) in slice.iter().rev() {
                     let IndexEntryUnpacked {
@@ -463,55 +456,59 @@ impl InMemoryLayer {
                             Vec::with_capacity(len as usize),
                         ),
                     });
+
+                    let io = reconstruct_state.update_key(&key, *entry_lsn, will_init);
+                    ios.insert((key, *entry_lsn), io);
+
                     if will_init {
                         break;
                     }
                 }
             }
         }
+        drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
+        let read_from = Arc::clone(self);
+        let read_ctx = ctx.attached_child();
+        reconstruct_state
+            .spawn_io(async move {
+                let inner = read_from.inner.read().await;
+                let f = vectored_dio_read::execute(
+                    &inner.file,
+                    reads
+                        .iter()
+                        .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
+                    &read_ctx,
+                );
+                send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
+                    .await;
 
-        // Execute the reads.
-
-        let f = vectored_dio_read::execute(
-            &inner.file,
-            reads
-                .iter()
-                .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
-            &ctx,
-        );
-        send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
-            .await;
-
-        // Process results into the reconstruct state
-        'next_key: for (key, value_reads) in reads {
-            for ValueRead { entry_lsn, read } in value_reads {
-                match read.into_result().expect("we run execute() above") {
-                    Err(e) => {
-                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
-                        continue 'next_key;
-                    }
-                    Ok(value_buf) => {
-                        let value = Value::des(&value_buf);
-                        if let Err(e) = value {
-                            reconstruct_state
-                                .on_key_error(key, PageReconstructError::from(anyhow!(e)));
-                            continue 'next_key;
+                for (key, value_reads) in reads {
+                    for ValueRead { entry_lsn, read } in value_reads {
+                        let io = ios.remove(&(key, entry_lsn)).expect("sender must exist");
+                        match read.into_result().expect("we run execute() above") {
+                            Err(e) => {
+                                io.complete(Err(std::io::Error::new(
+                                    e.kind(),
+                                    "dio vec read failed",
+                                )));
+                            }
+                            Ok(value_buf) => {
+                                io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into())));
+                            }
                         }
-
-                        let key_situation =
-                            reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
-                        if key_situation == ValueReconstructSituation::Complete {
-                            // TODO: metric to see if we fetched more values than necessary
-                            continue 'next_key;
-                        }
-
-                        // process the next value in the next iteration of the loop
                     }
                 }
-            }
-        }
 
-        reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
+                assert!(ios.is_empty());
+
+                // Keep layer existent until this IO is done;
+                // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
+                // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
+                // drop for consistency among all three layer types.
+                drop(inner);
+                drop(read_from);
+            })
+            .await;
 
         Ok(())
     }
@@ -606,6 +603,7 @@ impl InMemoryLayer {
         // Write the batch to the file
         inner.file.write_raw(&raw, ctx).await?;
         let new_size = inner.file.len();
+
         let expected_new_len = base_offset
             .checked_add(raw.len().into_u64())
             // write_raw would error if we were to overflow u64.
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 2b06c88e8b..2a86885f6b 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -308,7 +308,7 @@ impl Layer {
         reconstruct_data: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
-        let layer = self
+        let downloaded = self
             .0
             .get_or_maybe_download(true, Some(ctx))
             .await
@@ -318,11 +318,15 @@ impl Layer {
                 }
                 other => GetVectoredError::Other(anyhow::anyhow!(other)),
             })?;
+        let this = ResidentLayer {
+            downloaded: downloaded.clone(),
+            owner: self.clone(),
+        };
 
         self.record_access(ctx);
 
-        layer
-            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
+        downloaded
+            .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
             .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
             .await
             .map_err(|err| match err {
@@ -1768,25 +1772,25 @@ impl DownloadedLayer {
 
     async fn get_values_reconstruct_data(
         &self,
+        this: ResidentLayer,
         keyspace: KeySpace,
         lsn_range: Range<Lsn>,
         reconstruct_data: &mut ValuesReconstructState,
-        owner: &Arc<LayerInner>,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
         use LayerKind::*;
 
         match self
-            .get(owner, ctx)
+            .get(&this.owner.0, ctx)
             .await
             .map_err(GetVectoredError::Other)?
         {
             Delta(d) => {
-                d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
+                d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
                     .await
             }
             Image(i) => {
-                i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
+                i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx)
                     .await
             }
         }
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index fcb73ad20d..d93c378ffc 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -11,7 +11,10 @@ use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
 use crate::{
     context::DownloadBehavior,
-    tenant::{harness::test_img, storage_layer::LayerVisibilityHint},
+    tenant::{
+        harness::test_img,
+        storage_layer::{IoConcurrency, LayerVisibilityHint},
+    },
 };
 use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
 
@@ -31,6 +34,7 @@ async fn smoke_test() {
     let span = h.span();
     let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
     let (tenant, _) = h.load().await;
+    let io_concurrency = IoConcurrency::spawn_for_test();
 
     let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
 
@@ -89,7 +93,7 @@ async fn smoke_test() {
     };
 
     let img_before = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValuesReconstructState::new(io_concurrency.clone());
         layer
             .get_values_reconstruct_data(
                 controlfile_keyspace.clone(),
@@ -99,10 +103,13 @@ async fn smoke_test() {
             )
             .await
             .unwrap();
+
         data.keys
             .remove(&CONTROLFILE_KEY)
             .expect("must be present")
-            .expect("should not error")
+            .collect_pending_ios()
+            .await
+            .expect("must not error")
             .img
             .take()
             .expect("tenant harness writes the control file")
@@ -121,7 +128,7 @@ async fn smoke_test() {
 
     // on accesses when the layer is evicted, it will automatically be downloaded.
     let img_after = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValuesReconstructState::new(io_concurrency.clone());
         layer
             .get_values_reconstruct_data(
                 controlfile_keyspace.clone(),
@@ -135,7 +142,9 @@ async fn smoke_test() {
         data.keys
             .remove(&CONTROLFILE_KEY)
             .expect("must be present")
-            .expect("should not error")
+            .collect_pending_ios()
+            .await
+            .expect("must not error")
             .img
             .take()
             .expect("tenant harness writes the control file")
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e83b516d79..5d348ac474 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -20,6 +20,7 @@ use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::{stream::FuturesUnordered, StreamExt};
 use handle::ShardTimelineId;
 use offload::OffloadError;
 use once_cell::sync::Lazy;
@@ -74,6 +75,7 @@ use std::{
     ops::{Deref, Range},
 };
 
+use crate::l0_flush::{self, L0FlushGlobalState};
 use crate::{
     aux_file::AuxFileSizeEstimator,
     page_service::TenantManagerTypes,
@@ -81,7 +83,10 @@ use crate::{
         config::AttachmentMode,
         layer_map::{LayerMap, SearchResult},
         metadata::TimelineMetadata,
-        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
+        storage_layer::{
+            inmemory_layer::IndexEntry, IoConcurrency, PersistentLayerDesc,
+            ValueReconstructSituation,
+        },
     },
     walingest::WalLagCooldown,
     walredo,
@@ -102,10 +107,6 @@ use crate::{
 use crate::{
     disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
 };
-use crate::{
-    l0_flush::{self, L0FlushGlobalState},
-    metrics::GetKind,
-};
 use crate::{
     metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
@@ -1005,9 +1006,7 @@ impl Timeline {
             ranges: vec![key..key.next()],
         };
 
-        // Initialise the reconstruct state for the key with the cache
-        // entry returned above.
-        let mut reconstruct_state = ValuesReconstructState::new();
+        let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential());
 
         let vectored_res = self
             .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
@@ -1050,6 +1049,7 @@ impl Timeline {
         &self,
         keyspace: KeySpace,
         lsn: Lsn,
+        io_concurrency: super::storage_layer::IoConcurrency,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
         if !lsn.is_valid() {
@@ -1084,7 +1084,7 @@ impl Timeline {
             .get_vectored_impl(
                 keyspace.clone(),
                 lsn,
-                &mut ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(io_concurrency),
                 ctx,
             )
             .await;
@@ -1109,6 +1109,7 @@ impl Timeline {
         keyspace: KeySpace,
         lsn: Lsn,
         ctx: &RequestContext,
+        io_concurrency: super::storage_layer::IoConcurrency,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
         if !lsn.is_valid() {
             return Err(GetVectoredError::InvalidLsn(lsn));
@@ -1140,7 +1141,7 @@ impl Timeline {
             .get_vectored_impl(
                 keyspace.clone(),
                 lsn,
-                &mut ValuesReconstructState::default(),
+                &mut ValuesReconstructState::new(io_concurrency),
                 ctx,
             )
             .await;
@@ -1159,39 +1160,56 @@ impl Timeline {
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
-        let get_kind = if keyspace.total_raw_size() == 1 {
-            GetKind::Singular
-        } else {
-            GetKind::Vectored
+        let traversal_res: Result<(), _> = self
+            .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
+            .await;
+        if let Err(err) = traversal_res {
+            // Wait for all the spawned IOs to complete.
+            // See comments on `spawn_io` inside `storage_layer` for more details.
+            let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
+                .into_values()
+                .map(|state| state.collect_pending_ios())
+                .collect::<FuturesUnordered<_>>();
+            while collect_futs.next().await.is_some() {}
+            return Err(err);
         };
 
-        let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
-            .for_get_kind(get_kind)
-            .start_timer();
-        self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
-            .await?;
-        get_data_timer.stop_and_record();
-
-        let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
-            .for_get_kind(get_kind)
-            .start_timer();
-        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
         let layers_visited = reconstruct_state.get_layers_visited();
 
-        for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
-            match res {
-                Err(err) => {
-                    results.insert(key, Err(err));
-                }
-                Ok(state) => {
-                    let state = ValueReconstructState::from(state);
+        let futs = FuturesUnordered::new();
+        for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
+            futs.push({
+                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
+                async move {
+                    assert_eq!(state.situation, ValueReconstructSituation::Complete);
 
-                    let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
-                    results.insert(key, reconstruct_res);
+                    let converted = match state.collect_pending_ios().await {
+                        Ok(ok) => ok,
+                        Err(err) => {
+                            return (key, Err(err));
+                        }
+                    };
+
+                    // The walredo module expects the records to be descending in terms of Lsn.
+                    // And we submit the IOs in that order, so, there should be no need to sort here.
+                    debug_assert!(
+                        converted
+                            .records
+                            .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)),
+                        "{converted:?}"
+                    );
+
+                    (
+                        key,
+                        walredo_self.reconstruct_value(key, lsn, converted).await,
+                    )
                 }
-            }
+            });
         }
-        reconstruct_timer.stop_and_record();
+
+        let results = futs
+            .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
+            .await;
 
         // For aux file keys (v1 or v2) the vectored read path does not return an error
         // when they're missing. Instead they are omitted from the resulting btree
@@ -2873,6 +2891,14 @@ impl Timeline {
                     crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
                 };
 
+                let io_concurrency = IoConcurrency::spawn_from_conf(
+                    self_ref.conf,
+                    self_ref
+                        .gate
+                        .enter()
+                        .map_err(|_| CalculateLogicalSizeError::Cancelled)?,
+                );
+
                 let calculated_size = self_ref
                     .logical_size_calculation_task(
                         initial_part_end,
@@ -2882,7 +2908,11 @@ impl Timeline {
                     .await?;
 
                 self_ref
-                    .trigger_aux_file_size_computation(initial_part_end, background_ctx)
+                    .trigger_aux_file_size_computation(
+                        initial_part_end,
+                        background_ctx,
+                        io_concurrency,
+                    )
                     .await?;
 
                 // TODO: add aux file size to logical size
@@ -4115,6 +4145,7 @@ impl Timeline {
 
     /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large,
     /// so that at most one image layer will be produced from this function.
+    #[allow(clippy::too_many_arguments)]
     async fn create_image_layer_for_rel_blocks(
         self: &Arc<Self>,
         partition: &KeySpace,
@@ -4123,6 +4154,7 @@ impl Timeline {
         ctx: &RequestContext,
         img_range: Range<Key>,
         start: Key,
+        io_concurrency: IoConcurrency,
     ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
         let mut wrote_keys = false;
 
@@ -4151,7 +4183,12 @@ impl Timeline {
                     || (last_key_in_range && key_request_accum.raw_size() > 0)
                 {
                     let results = self
-                        .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
+                        .get_vectored(
+                            key_request_accum.consume_keyspace(),
+                            lsn,
+                            io_concurrency.clone(),
+                            ctx,
+                        )
                         .await?;
 
                     if self.cancel.is_cancelled() {
@@ -4230,9 +4267,10 @@ impl Timeline {
         img_range: Range<Key>,
         mode: ImageLayerCreationMode,
         start: Key,
+        io_concurrency: IoConcurrency,
     ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
         // Metadata keys image layer creation.
-        let mut reconstruct_state = ValuesReconstructState::default();
+        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
         let begin = Instant::now();
         let data = self
             .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
@@ -4449,6 +4487,13 @@ impl Timeline {
                 )))
             });
 
+            let io_concurrency = IoConcurrency::spawn_from_conf(
+                self.conf,
+                self.gate
+                    .enter()
+                    .map_err(|_| CreateImageLayersError::Cancelled)?,
+            );
+
             if !compact_metadata {
                 let ImageLayerCreationOutcome {
                     image,
@@ -4461,6 +4506,7 @@ impl Timeline {
                         ctx,
                         img_range,
                         start,
+                        io_concurrency,
                     )
                     .await?;
 
@@ -4479,6 +4525,7 @@ impl Timeline {
                         img_range,
                         mode,
                         start,
+                        io_concurrency,
                     )
                     .await?;
                 start = next_start_key;
@@ -5746,13 +5793,14 @@ impl Timeline {
         self: &Arc<Timeline>,
         lsn: Lsn,
         ctx: &RequestContext,
+        io_concurrency: IoConcurrency,
     ) -> anyhow::Result<Vec<(Key, Bytes)>> {
         let mut all_data = Vec::new();
         let guard = self.layers.read().await;
         for layer in guard.layer_map()?.iter_historic_layers() {
             if !layer.is_delta() && layer.image_layer_lsn() == lsn {
                 let layer = guard.get_from_desc(&layer);
-                let mut reconstruct_data = ValuesReconstructState::default();
+                let mut reconstruct_data = ValuesReconstructState::new(io_concurrency.clone());
                 layer
                     .get_values_reconstruct_data(
                         KeySpace::single(Key::MIN..Key::MAX),
@@ -5761,8 +5809,9 @@ impl Timeline {
                         ctx,
                     )
                     .await?;
-                for (k, v) in reconstruct_data.keys {
-                    all_data.push((k, v?.img.unwrap().1));
+                for (k, v) in std::mem::take(&mut reconstruct_data.keys) {
+                    let v = v.collect_pending_ios().await?;
+                    all_data.push((k, v.img.unwrap().1));
                 }
             }
         }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 06a21f6b3c..57fc415d06 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -42,8 +42,8 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::{
     AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
 };
-use crate::tenant::timeline::ImageLayerCreationOutcome;
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
+use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -3170,6 +3170,7 @@ impl TimelineAdaptor {
                 ctx,
                 key_range.clone(),
                 start,
+                IoConcurrency::sequential(),
             )
             .await?;
 
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index dfe2352310..47fb4a276b 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -35,6 +35,7 @@ use crate::virtual_file::{self, VirtualFile};
 pub struct BlobMeta {
     pub key: Key,
     pub lsn: Lsn,
+    pub will_init: bool,
 }
 
 /// A view into the vectored blobs read buffer.
@@ -310,7 +311,15 @@ pub enum BlobFlag {
 /// * Iterate over the collected blobs and coalesce them into reads at the end
 pub struct VectoredReadPlanner {
     // Track all the blob offsets. Start offsets must be ordered.
-    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
+    // Values in the value tuples are:
+    // (
+    //   lsn of the blob,
+    //   start offset of the blob in the underlying file,
+    //   end offset of the blob in the underlying file,
+    //   whether the blob initializes the page image or not
+    //   see [`pageserver_api::record::NeonWalRecord::will_init`]
+    // )
+    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64, bool)>>,
     // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
     prev: Option<(Key, Lsn, u64, BlobFlag)>,
 
@@ -371,12 +380,12 @@ impl VectoredReadPlanner {
         match flag {
             BlobFlag::None => {
                 let blobs_for_key = self.blobs.entry(key).or_default();
-                blobs_for_key.push((lsn, start_offset, end_offset));
+                blobs_for_key.push((lsn, start_offset, end_offset, false));
             }
             BlobFlag::ReplaceAll => {
                 let blobs_for_key = self.blobs.entry(key).or_default();
                 blobs_for_key.clear();
-                blobs_for_key.push((lsn, start_offset, end_offset));
+                blobs_for_key.push((lsn, start_offset, end_offset, true));
             }
             BlobFlag::Ignore => {}
         }
@@ -387,11 +396,17 @@ impl VectoredReadPlanner {
         let mut reads = Vec::new();
 
         for (key, blobs_for_key) in self.blobs {
-            for (lsn, start_offset, end_offset) in blobs_for_key {
+            for (lsn, start_offset, end_offset, will_init) in blobs_for_key {
                 let extended = match &mut current_read_builder {
-                    Some(read_builder) => {
-                        read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
-                    }
+                    Some(read_builder) => read_builder.extend(
+                        start_offset,
+                        end_offset,
+                        BlobMeta {
+                            key,
+                            lsn,
+                            will_init,
+                        },
+                    ),
                     None => VectoredReadExtended::No,
                 };
 
@@ -399,7 +414,11 @@ impl VectoredReadPlanner {
                     let next_read_builder = ChunkedVectoredReadBuilder::new(
                         start_offset,
                         end_offset,
-                        BlobMeta { key, lsn },
+                        BlobMeta {
+                            key,
+                            lsn,
+                            will_init,
+                        },
                         self.max_read_size,
                     );
 
@@ -527,7 +546,7 @@ impl<'a> VectoredBlobReader<'a> {
 pub struct StreamingVectoredReadPlanner {
     read_builder: Option<ChunkedVectoredReadBuilder>,
     // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
-    prev: Option<(Key, Lsn, u64)>,
+    prev: Option<(Key, Lsn, u64, bool)>,
     /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
     /// we will produce a single batch instead of split them.
     max_read_size: u64,
@@ -550,27 +569,47 @@ impl StreamingVectoredReadPlanner {
         }
     }
 
-    pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option<VectoredRead> {
+    pub fn handle(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        offset: u64,
+        will_init: bool,
+    ) -> Option<VectoredRead> {
         // Implementation note: internally lag behind by one blob such that
         // we have a start and end offset when initialising [`VectoredRead`]
-        let (prev_key, prev_lsn, prev_offset) = match self.prev {
+        let (prev_key, prev_lsn, prev_offset, prev_will_init) = match self.prev {
             None => {
-                self.prev = Some((key, lsn, offset));
+                self.prev = Some((key, lsn, offset, will_init));
                 return None;
             }
             Some(prev) => prev,
         };
 
-        let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false);
+        let res = self.add_blob(
+            prev_key,
+            prev_lsn,
+            prev_offset,
+            offset,
+            false,
+            prev_will_init,
+        );
 
-        self.prev = Some((key, lsn, offset));
+        self.prev = Some((key, lsn, offset, will_init));
 
         res
     }
 
     pub fn handle_range_end(&mut self, offset: u64) -> Option<VectoredRead> {
-        let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev {
-            self.add_blob(prev_key, prev_lsn, prev_offset, offset, true)
+        let res = if let Some((prev_key, prev_lsn, prev_offset, prev_will_init)) = self.prev {
+            self.add_blob(
+                prev_key,
+                prev_lsn,
+                prev_offset,
+                offset,
+                true,
+                prev_will_init,
+            )
         } else {
             None
         };
@@ -587,10 +626,19 @@ impl StreamingVectoredReadPlanner {
         start_offset: u64,
         end_offset: u64,
         is_last_blob_in_read: bool,
+        will_init: bool,
     ) -> Option<VectoredRead> {
         match &mut self.read_builder {
             Some(read_builder) => {
-                let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
+                let extended = read_builder.extend(
+                    start_offset,
+                    end_offset,
+                    BlobMeta {
+                        key,
+                        lsn,
+                        will_init,
+                    },
+                );
                 assert_eq!(extended, VectoredReadExtended::Yes);
             }
             None => {
@@ -598,7 +646,11 @@ impl StreamingVectoredReadPlanner {
                     Some(ChunkedVectoredReadBuilder::new_streaming(
                         start_offset,
                         end_offset,
-                        BlobMeta { key, lsn },
+                        BlobMeta {
+                            key,
+                            lsn,
+                            will_init,
+                        },
                     ))
                 };
             }
@@ -812,7 +864,7 @@ mod tests {
         let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000);
         let mut reads = Vec::new();
         for (key, lsn, offset, _) in blob_descriptions.clone() {
-            reads.extend(planner.handle(key, lsn, offset));
+            reads.extend(planner.handle(key, lsn, offset, false));
         }
         reads.extend(planner.handle_range_end(652 * 1024));
 
@@ -850,7 +902,7 @@ mod tests {
         let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
         let mut reads = Vec::new();
         for (key, lsn, offset, _) in blob_descriptions.clone() {
-            reads.extend(planner.handle(key, lsn, offset));
+            reads.extend(planner.handle(key, lsn, offset, false));
         }
         reads.extend(planner.handle_range_end(652 * 1024));
 
@@ -875,7 +927,7 @@ mod tests {
         {
             let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
             let mut reads = Vec::new();
-            reads.extend(planner.handle(key, lsn, 0));
+            reads.extend(planner.handle(key, lsn, 0, false));
             reads.extend(planner.handle_range_end(652 * 1024));
             assert_eq!(reads.len(), 1);
             validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
@@ -883,8 +935,8 @@ mod tests {
         {
             let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
             let mut reads = Vec::new();
-            reads.extend(planner.handle(key, lsn, 0));
-            reads.extend(planner.handle(key, lsn, 128 * 1024));
+            reads.extend(planner.handle(key, lsn, 0, false));
+            reads.extend(planner.handle(key, lsn, 128 * 1024, false));
             reads.extend(planner.handle_range_end(652 * 1024));
             assert_eq!(reads.len(), 2);
             validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
@@ -893,8 +945,8 @@ mod tests {
         {
             let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
             let mut reads = Vec::new();
-            reads.extend(planner.handle(key, lsn, 0));
-            reads.extend(planner.handle(key, lsn, 128 * 1024));
+            reads.extend(planner.handle(key, lsn, 0, false));
+            reads.extend(planner.handle(key, lsn, 128 * 1024, false));
             reads.extend(planner.handle_range_end(652 * 1024));
             assert_eq!(reads.len(), 1);
             validate_read(
@@ -923,6 +975,7 @@ mod tests {
         let meta = BlobMeta {
             key: Key::MIN,
             lsn: Lsn(0),
+            will_init: false,
         };
 
         for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index ad7bcc0714..e0283d99e0 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -499,7 +499,13 @@ impl WalIngest {
 
                 let content = modification
                     .tline
-                    .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx)
+                    .get_rel_page_at_lsn(
+                        src_rel,
+                        blknum,
+                        Version::Modified(modification),
+                        ctx,
+                        crate::tenant::storage_layer::IoConcurrency::sequential(),
+                    )
                     .await?;
                 modification.put_rel_page_image(dst_rel, blknum, content)?;
                 num_blocks_copied += 1;
@@ -1489,6 +1495,7 @@ mod tests {
     use super::*;
     use crate::tenant::harness::*;
     use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
+    use crate::tenant::storage_layer::IoConcurrency;
     use postgres_ffi::RELSEG_SIZE;
 
     use crate::DEFAULT_PG_VERSION;
@@ -1532,6 +1539,7 @@ mod tests {
     #[tokio::test]
     async fn test_relsize() -> Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -1599,7 +1607,13 @@ mod tests {
         // Check page contents at each LSN
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    0,
+                    Version::Lsn(Lsn(0x20)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 0 at 2")
@@ -1607,7 +1621,13 @@ mod tests {
 
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    0,
+                    Version::Lsn(Lsn(0x30)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 0 at 3")
@@ -1615,14 +1635,26 @@ mod tests {
 
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    0,
+                    Version::Lsn(Lsn(0x40)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    1,
+                    Version::Lsn(Lsn(0x40)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 1 at 4")
@@ -1630,21 +1662,39 @@ mod tests {
 
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    0,
+                    Version::Lsn(Lsn(0x50)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    1,
+                    Version::Lsn(Lsn(0x50)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 1 at 4")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    2,
+                    Version::Lsn(Lsn(0x50)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 2 at 5")
@@ -1667,14 +1717,26 @@ mod tests {
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    0,
+                    Version::Lsn(Lsn(0x60)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    1,
+                    Version::Lsn(Lsn(0x60)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 1 at 4")
@@ -1689,7 +1751,13 @@ mod tests {
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    2,
+                    Version::Lsn(Lsn(0x50)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 2 at 5")
@@ -1722,14 +1790,26 @@ mod tests {
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    0,
+                    Version::Lsn(Lsn(0x70)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             ZERO_PAGE
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    1,
+                    Version::Lsn(Lsn(0x70)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 1")
@@ -1750,7 +1830,13 @@ mod tests {
         for blk in 2..1500 {
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx)
+                    .get_rel_page_at_lsn(
+                        TESTREL_A,
+                        blk,
+                        Version::Lsn(Lsn(0x80)),
+                        &ctx,
+                        io_concurrency.clone()
+                    )
                     .instrument(test_span.clone())
                     .await?,
                 ZERO_PAGE
@@ -1758,7 +1844,13 @@ mod tests {
         }
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_page_at_lsn(
+                    TESTREL_A,
+                    1500,
+                    Version::Lsn(Lsn(0x80)),
+                    &ctx,
+                    io_concurrency.clone()
+                )
                 .instrument(test_span.clone())
                 .await?,
             test_img("foo blk 1500")
@@ -1851,6 +1943,7 @@ mod tests {
             .await?
             .load()
             .await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -1903,7 +1996,13 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx)
+                    .get_rel_page_at_lsn(
+                        TESTREL_A,
+                        blkno,
+                        Version::Lsn(lsn),
+                        &ctx,
+                        io_concurrency.clone()
+                    )
                     .instrument(test_span.clone())
                     .await?,
                 test_img(&data)
@@ -1931,7 +2030,13 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx)
+                    .get_rel_page_at_lsn(
+                        TESTREL_A,
+                        blkno,
+                        Version::Lsn(Lsn(0x60)),
+                        &ctx,
+                        io_concurrency.clone()
+                    )
                     .instrument(test_span.clone())
                     .await?,
                 test_img(&data)
@@ -1950,7 +2055,13 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx)
+                    .get_rel_page_at_lsn(
+                        TESTREL_A,
+                        blkno,
+                        Version::Lsn(Lsn(0x50)),
+                        &ctx,
+                        io_concurrency.clone()
+                    )
                     .instrument(test_span.clone())
                     .await?,
                 test_img(&data)
@@ -1987,7 +2098,13 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx)
+                    .get_rel_page_at_lsn(
+                        TESTREL_A,
+                        blkno,
+                        Version::Lsn(Lsn(0x80)),
+                        &ctx,
+                        io_concurrency.clone()
+                    )
                     .instrument(test_span.clone())
                     .await?,
                 test_img(&data)
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index 6c22b31e00..c82c7578d1 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -208,6 +208,10 @@ class ShardIndex:
             shard_count=int(input[2:4], 16),
         )
 
+    @property
+    def is_sharded(self) -> bool:
+        return self.shard_count != 0
+
 
 class TenantShardId:
     def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int):
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index fa541bad17..fd7e193778 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -126,12 +126,8 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
     "pageserver_page_cache_read_accesses_total",
     "pageserver_page_cache_size_current_bytes",
     "pageserver_page_cache_size_max_bytes",
-    "pageserver_getpage_reconstruct_seconds_bucket",
-    "pageserver_getpage_reconstruct_seconds_count",
-    "pageserver_getpage_reconstruct_seconds_sum",
     *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
     *histogram("pageserver_smgr_query_seconds_global"),
-    *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
     *histogram("pageserver_wait_lsn_seconds"),
     *histogram("pageserver_remote_operation_seconds"),
     *histogram("pageserver_io_operations_seconds"),
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d79c2a5ea8..af427b92d2 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -313,6 +313,10 @@ class PgProtocol:
         """
         return self.safe_psql(query, log_query=log_query)[0][0]
 
+    def show_timeline_id(self) -> TimelineId:
+        """SHOW neon.timeline_id"""
+        return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0]))
+
 
 class PageserverWalReceiverProtocol(StrEnum):
     VANILLA = "vanilla"
@@ -387,6 +391,7 @@ class NeonEnvBuilder:
         storage_controller_port_override: int | None = None,
         pageserver_virtual_file_io_mode: str | None = None,
         pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None,
+        pageserver_get_vectored_concurrent_io: str | None = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -426,6 +431,9 @@ class NeonEnvBuilder:
         self.storage_controller_config: dict[Any, Any] | None = None
 
         self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine
+        self.pageserver_get_vectored_concurrent_io: str | None = (
+            pageserver_get_vectored_concurrent_io
+        )
 
         self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = (
             pageserver_default_tenant_config_compaction_algorithm
@@ -452,6 +460,7 @@ class NeonEnvBuilder:
         self.test_name = test_name
         self.compatibility_neon_binpath = compatibility_neon_binpath
         self.compatibility_pg_distrib_dir = compatibility_pg_distrib_dir
+        self.test_may_use_compatibility_snapshot_binaries = False
         self.version_combination = combination
         self.mixdir = self.test_output_dir / "mixdir_neon"
         if self.version_combination is not None:
@@ -463,6 +472,7 @@ class NeonEnvBuilder:
             ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions"
             self.mixdir.mkdir(mode=0o755, exist_ok=True)
             self._mix_versions()
+            self.test_may_use_compatibility_snapshot_binaries = True
 
     def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv:
         # Cannot create more than one environment from one builder
@@ -1062,6 +1072,7 @@ class NeonEnv:
         self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
         self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
         self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol
+        self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io
 
         # Create the neon_local's `NeonLocalInitConf`
         cfg: dict[str, Any] = {
@@ -1121,6 +1132,20 @@ class NeonEnv:
                 "max_batch_size": 32,
             }
 
+            # Concurrent IO (https://github.com/neondatabase/neon/issues/9378):
+            # enable concurrent IO by default in tests and benchmarks.
+            # Compat tests are exempt because old versions fail to parse the new config.
+            get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
+            if config.test_may_use_compatibility_snapshot_binaries:
+                log.info(
+                    "Forcing use of binary-built-in default to avoid forward-compatibility related test failures"
+                )
+                get_vectored_concurrent_io = None
+            if get_vectored_concurrent_io is not None:
+                ps_cfg["get_vectored_concurrent_io"] = {
+                    "mode": self.pageserver_get_vectored_concurrent_io,
+                }
+
             if self.pageserver_virtual_file_io_engine is not None:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
             if config.pageserver_default_tenant_config_compaction_algorithm is not None:
@@ -1455,6 +1480,7 @@ def neon_simple_env(
     pageserver_virtual_file_io_engine: str,
     pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None,
     pageserver_virtual_file_io_mode: str | None,
+    pageserver_get_vectored_concurrent_io: str | None,
 ) -> Iterator[NeonEnv]:
     """
     Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync.
@@ -1487,6 +1513,7 @@ def neon_simple_env(
         pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
         pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
         pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
+        pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io,
         combination=combination,
     ) as builder:
         env = builder.init_start()
@@ -1513,6 +1540,7 @@ def neon_env_builder(
     pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None,
     record_property: Callable[[str, object], None],
     pageserver_virtual_file_io_mode: str | None,
+    pageserver_get_vectored_concurrent_io: str | None,
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
@@ -1555,6 +1583,7 @@ def neon_env_builder(
         test_overlay_dir=test_overlay_dir,
         pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
         pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
+        pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io,
     ) as builder:
         yield builder
         # Propogate `preserve_database_files` to make it possible to use in other fixtures,
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index f57c0f801f..3404c16f55 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -44,6 +44,11 @@ def pageserver_virtual_file_io_mode() -> str | None:
     return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE")
 
 
+@pytest.fixture(scope="function", autouse=True)
+def pageserver_get_vectored_concurrent_io() -> str | None:
+    return os.getenv("PAGESERVER_GET_VECTORED_CONCURRENT_IO")
+
+
 def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None:
     toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
     if toml_table is None:
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index a6eaaf6c4c..ac44630d30 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -251,6 +251,8 @@ def test_forward_compatibility(
         os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
     )
 
+    neon_env_builder.test_may_use_compatibility_snapshot_binaries = True
+
     try:
         neon_env_builder.num_safekeepers = 3
 

From c283aaaf8d66dd04ce463733cf6545269f70f4c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Wed, 22 Jan 2025 17:09:41 +0100
Subject: [PATCH 26/37] Tag images from docker-hub in promote-images-prod
 (#10475)

## Problem

https://github.com/neondatabase/neon/actions/runs/12896686483/job/35961290336#step:5:107
showed that `promote-images-prod` was missing another dependency.

## Summary of changes
Modify `promote-images-prod` to tag based on docker-hub images, so that
`promote-images-prod` does not rely on `promote-images-dev`. The result
should be exactly the same, but this allows the two jobs to run in parallel.
---
 .github/workflows/build_and_test.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 4fc81dccaa..b1230879d3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -892,14 +892,14 @@ jobs:
         run: |
           for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
             docker buildx imagetools create -t $repo/neon:latest \
-                                               $repo/neon:${{ needs.tag.outputs.build-tag }}
+                                               neondatabase/neon:${{ needs.tag.outputs.build-tag }}
 
             for version in ${VERSIONS}; do
               docker buildx imagetools create -t $repo/compute-node-${version}:latest \
-                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+                                                 neondatabase/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
 
               docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
-                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+                                                 neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
             done
           done
           docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \

From f1473dd438ebfeb2eeae7f9d619cd0478c47e470 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Wed, 22 Jan 2025 17:34:57 +0100
Subject: [PATCH 27/37] Fix the connection error for extension tests (#10480)

## Problem
The trust connection to the compute required for `pg_anon` was removed.
However, the PGPASSWORD environment variable was not added to
`docker-compose.yml`.
This caused connection errors, which were interpreted as success due to
errors in the bash script.
## Summary of changes
The environment variable was added, and the logic in the bash script was
fixed.
---
 docker-compose/docker-compose.yml | 2 ++
 docker-compose/run-tests.sh       | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml
index 6e15fdbe0d..4f0a887c27 100644
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -185,6 +185,8 @@ services:
   neon-test-extensions:
     profiles: ["test-extensions"]
     image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
+    environment:
+      - PGPASSWORD=cloud_admin
     entrypoint:
       - "/bin/bash"
       - "-c"
diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh
index 3fc0b90071..9873187b62 100644
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -7,7 +7,10 @@ LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
 for d in ${LIST}
 do
        [ -d "${d}" ] || continue
-    psql -c "select 1" >/dev/null || break
+       if ! psql -w -c "select 1" >/dev/null; then
+          FAILED="${d} ${FAILED}"
+          break
+       fi
        USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
 done
 [ -z "${FAILED}" ] && exit 0

From c60b91369aef527f63aebc99d1e6e148dbece506 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 22 Jan 2025 19:52:16 +0100
Subject: [PATCH 28/37] Expose safekeeper APIs for creation and deletion
 (#10478)

Add APIs for timeline creation and deletion to the safekeeper client
crate. Going to be used later in #10440.

Split off from #10440.

Part of https://github.com/neondatabase/neon/issues/9011
---
 safekeeper/client/src/mgmt_api.rs | 32 ++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs
index 5727f32509..f65bfaa6d5 100644
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -4,7 +4,7 @@
 //! united.
 
 use reqwest::{IntoUrl, Method, StatusCode};
-use safekeeper_api::models::TimelineStatus;
+use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus};
 use std::error::Error as _;
 use utils::{
     http::error::HttpErrorBody,
@@ -76,6 +76,28 @@ impl Client {
         }
     }
 
+    pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<TimelineStatus> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}",
+            self.mgmt_api_endpoint, req.tenant_id, req.timeline_id
+        );
+        let resp = self.post(&uri, req).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+
+    pub async fn delete_timeline(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<TimelineStatus> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}",
+            self.mgmt_api_endpoint, tenant_id, timeline_id
+        );
+        let resp = self.request(Method::DELETE, &uri, ()).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+
     pub async fn timeline_status(
         &self,
         tenant_id: TenantId,
@@ -107,6 +129,14 @@ impl Client {
         self.get(&uri).await
     }
 
+    async fn post<B: serde::Serialize, U: IntoUrl>(
+        &self,
+        uri: U,
+        body: B,
+    ) -> Result<reqwest::Response> {
+        self.request(Method::POST, uri, body).await
+    }
+
     async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
         self.request(Method::GET, uri, ()).await
     }

From 0af40b5494a56ca19b982ad7b53045768fd79eb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 22 Jan 2025 20:45:12 +0100
Subject: [PATCH 29/37] Only churn rows once in
 test_scrubber_physical_gc_ancestors (#10481)

## Problem

PR #10457 was supposed to fix the flakiness of
`test_scrubber_physical_gc_ancestors`, but instead it made it even more
flaky. The original error causes did disappear, but they were replaced
by "key not found" errors.

See this for a longer explanation:
https://github.com/neondatabase/neon/issues/10391#issuecomment-2608018967

## Solution

This churns rows only once: after all compactions and before any
timeline GCs. That way, we remain more accessible at older LSNs.
---
 test_runner/regress/test_storage_scrubber.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index a782e85567..1304d302b7 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -271,8 +271,14 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_
         ps.http_client().timeline_compact(
             shard, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True
         )
-        # Add some WAL so that we don't gc at the latest remote consistent lsn
-        workload.churn_rows(1)
+
+    # Add some WAL so that we don't gc at the latest remote consistent lsn
+    workload.churn_rows(10)
+
+    # Now gc the old stuff away
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        assert ps is not None
         ps.http_client().timeline_gc(shard, timeline_id, 0)
 
     # We will use a min_age_secs=1 threshold for deletion, let it pass

From 92d95b08cfba6973bc735538fe5778f40b0dd45c Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 22 Jan 2025 19:15:46 -0500
Subject: [PATCH 30/37] fix(pageserver): extend split job key range to the end
 (#10484)

## Problem

Not really a bug fix, but hopefully this helps reproduce
https://github.com/neondatabase/neon/issues/10482 more often.

If the layer map does not contain layers that end at exactly the end
range of the compaction job, the current split algorithm will produce
the last job that ends at the maximum layer key. This patch extends it
all the way to the compaction job end key.

For example, the user requests a compaction of 0000...FFFF. However, we
only have a layer 0000..3000 in the layer map, and the split job will
have a range of 0000..3000 instead of 0000..FFFF.

This is not a correctness issue but it would be better to fix it so that
we can get consistent job splits.

## Summary of changes

Compaction job split will always cover the full specified key range.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs | 6 ++++++
 test_runner/regress/test_compaction.py       | 9 +++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 57fc415d06..4d5dc2d8a9 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -2212,6 +2212,12 @@ impl Timeline {
                 } else {
                     end
                 };
+                let end = if ranges_num == idx + 1 {
+                    // extend the compaction range to the end of the key range if it's the last partition
+                    end.max(job.compact_key_range.end)
+                } else {
+                    end
+                };
                 info!(
                     "splitting compaction job: {}..{}, estimated_size={}",
                     start, end, total_size
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index d0a2349ccf..fde26e1533 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -150,8 +150,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
     child_workloads: list[Workload] = []
 
     for i in range(1, churn_rounds + 1):
-        if i % 10 == 0:
-            log.info(f"Running churn round {i}/{churn_rounds} ...")
+        log.info(f"Running churn round {i}/{churn_rounds} ...")
         if i % 10 == 5 and with_branches == "with_branches":
             branch_name = f"child-{i}"
             branch_timeline_id = env.create_branch(branch_name)
@@ -172,8 +171,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
                     "sub_compaction_max_job_size_mb": 16,
                 },
             )
-
-        workload.churn_rows(row_count, env.pageserver.id)
+        # do not wait for upload so that we can see if gc_compaction works well with data being ingested
+        workload.churn_rows(row_count, env.pageserver.id, upload=False)
+        time.sleep(1)
+        workload.validate(env.pageserver.id)
 
     def compaction_finished():
         queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id))

From 8e8df1b4539403b294a02332bbc14252b81b3cc9 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 23 Jan 2025 11:02:15 +0000
Subject: [PATCH 31/37] Disable logical replication subscribers (#10249)

Drop logical replication subscribers
before compute starts on a non-main branch.

Add new compute_ctl spec flag: drop_subscriptions_before_start
If it is set, drop all the subscriptions from the compute node
before it starts.

To avoid race on compute start, use new GUC
neon.disable_logical_replication_subscribers
to temporarily disable logical replication workers until we drop the
subscriptions.

Ensure that we drop subscriptions exactly once when the endpoint starts on a
new branch.
This is essential because otherwise we may drop not only inherited, but
also newly created subscriptions.

We cannot rely only on spec.drop_subscriptions_before_start flag,
because if for some reason compute restarts inside VM,
it will start again with the same spec and flag value.

To handle this, we save the fact of the operation in the database
in the neon.drop_subscriptions_done table.
If the table does not exist, we assume that the operation was never
performed, so we must do it.
If table exists, we check if the operation was performed on the current
timeline.

fixes: https://github.com/neondatabase/neon/issues/8790
---
 compute_tools/src/compute.rs                  | 104 ++++++--
 compute_tools/src/config.rs                   |   7 +
 compute_tools/src/spec_apply.rs               |  24 +-
 ...or_drop_dbs.sql => drop_subscriptions.sql} |   0
 .../src/sql/finalize_drop_subscriptions.sql   |  21 ++
 control_plane/src/bin/neon_local.rs           |   1 +
 control_plane/src/endpoint.rs                 |   7 +
 libs/compute_api/src/spec.rs                  |   7 +
 pgxn/neon/neon.c                              |  10 +
 test_runner/fixtures/neon_cli.py              |   3 +
 test_runner/fixtures/neon_fixtures.py         |   2 +
 .../regress/test_subscriber_branching.py      | 242 ++++++++++++++++++
 vendor/postgres-v14                           |   2 +-
 vendor/postgres-v15                           |   2 +-
 vendor/postgres-v16                           |   2 +-
 vendor/postgres-v17                           |   2 +-
 vendor/revisions.json                         |   8 +-
 17 files changed, 413 insertions(+), 31 deletions(-)
 rename compute_tools/src/sql/{drop_subscription_for_drop_dbs.sql => drop_subscriptions.sql} (100%)
 create mode 100644 compute_tools/src/sql/finalize_drop_subscriptions.sql
 create mode 100644 test_runner/regress/test_subscriber_branching.py

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 1ac97a378b..fd76e404c6 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -41,14 +41,14 @@ use crate::local_proxy;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::spec_apply::ApplySpecPhase::{
-    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser,
-    DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions,
-    RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
+    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
+    CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
+    HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
+    RunInEachDatabase,
 };
 use crate::spec_apply::PerDatabasePhase;
 use crate::spec_apply::PerDatabasePhase::{
-    ChangeSchemaPerms, DeleteDBRoleReferences, DropSubscriptionsForDeletedDatabases,
-    HandleAnonExtension,
+    ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
 };
 use crate::spec_apply::{apply_operations, MutableApplyContext, DB};
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -340,6 +340,15 @@ impl ComputeNode {
         self.state.lock().unwrap().status
     }
 
+    pub fn get_timeline_id(&self) -> Option<TimelineId> {
+        self.state
+            .lock()
+            .unwrap()
+            .pspec
+            .as_ref()
+            .map(|s| s.timeline_id)
+    }
+
     // Remove `pgdata` directory and create it again with right permissions.
     fn create_pgdata(&self) -> Result<()> {
         // Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
@@ -929,6 +938,48 @@ impl ComputeNode {
                 .map(|role| (role.name.clone(), role))
                 .collect::<HashMap<String, Role>>();
 
+            // Check if we need to drop subscriptions before starting the endpoint.
+            //
+            // It is important to do this operation exactly once when endpoint starts on a new branch.
+            // Otherwise, we may drop not only inherited, but also newly created subscriptions.
+            //
+            // We cannot rely only on spec.drop_subscriptions_before_start flag,
+            // because if for some reason compute restarts inside VM,
+            // it will start again with the same spec and flag value.
+            //
+            // To handle this, we save the fact of the operation in the database
+            // in the neon.drop_subscriptions_done table.
+            // If the table does not exist, we assume that the operation was never performed, so we must do it.
+            // If the table exists, we check if the operation was performed on the current timeline.
+            //
+            let mut drop_subscriptions_done = false;
+
+            if spec.drop_subscriptions_before_start {
+                let timeline_id = self.get_timeline_id().context("timeline_id must be set")?;
+                let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id);
+
+                info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
+
+                drop_subscriptions_done =  match
+                    client.simple_query(&query).await {
+                    Ok(result) => {
+                        matches!(&result[0], postgres::SimpleQueryMessage::Row(_))
+                    },
+                    Err(e) =>
+                    {
+                        match e.code() {
+                            Some(&SqlState::UNDEFINED_TABLE) => false,
+                            _ => {
+                                // We don't expect any other error here, except for the schema/table not existing
+                                error!("Error checking if drop subscription operation was already performed: {}", e);
+                                return Err(e.into());
+                            }
+                        }
+                    }
+                }
+            };
+
+
             let jwks_roles = Arc::new(
                 spec.as_ref()
                     .local_proxy_config
@@ -996,7 +1047,7 @@ impl ComputeNode {
                         jwks_roles.clone(),
                         concurrency_token.clone(),
                         db,
-                        [DropSubscriptionsForDeletedDatabases].to_vec(),
+                        [DropLogicalSubscriptions].to_vec(),
                     );
 
                     Ok(spawn(fut))
@@ -1024,6 +1075,7 @@ impl ComputeNode {
                 CreateAndAlterRoles,
                 RenameAndDeleteDatabases,
                 CreateAndAlterDatabases,
+                CreateSchemaNeon,
             ] {
                 info!("Applying phase {:?}", &phase);
                 apply_operations(
@@ -1064,6 +1116,17 @@ impl ComputeNode {
                     }
 
                     let conf = Arc::new(conf);
+                    let mut phases = vec![
+                        DeleteDBRoleReferences,
+                        ChangeSchemaPerms,
+                        HandleAnonExtension,
+                    ];
+
+                    if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
+                        info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
+                        phases.push(DropLogicalSubscriptions);
+                    }
+
                     let fut = Self::apply_spec_sql_db(
                         spec.clone(),
                         conf,
@@ -1071,12 +1134,7 @@ impl ComputeNode {
                         jwks_roles.clone(),
                         concurrency_token.clone(),
                         db,
-                        [
-                            DeleteDBRoleReferences,
-                            ChangeSchemaPerms,
-                            HandleAnonExtension,
-                        ]
-                        .to_vec(),
+                        phases,
                     );
 
                     Ok(spawn(fut))
@@ -1088,12 +1146,20 @@ impl ComputeNode {
                 handle.await??;
             }
 
-            for phase in vec![
+            let mut phases = vec![
                 HandleOtherExtensions,
-                HandleNeonExtension,
+                HandleNeonExtension, // This step depends on CreateSchemaNeon
                 CreateAvailabilityCheck,
                 DropRoles,
-            ] {
+            ];
+
+            // This step depends on CreateSchemaNeon
+            if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
+                info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
+                phases.push(FinalizeDropLogicalSubscriptions);
+            }
+
+            for phase in phases {
                 debug!("Applying phase {:?}", &phase);
                 apply_operations(
                     spec.clone(),
@@ -1463,6 +1529,14 @@ impl ComputeNode {
                         Ok(())
                     },
                 )?;
+
+                let postgresql_conf_path = pgdata_path.join("postgresql.conf");
+                if config::line_in_file(
+                    &postgresql_conf_path,
+                    "neon.disable_logical_replication_subscribers=false",
+                )? {
+                    info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false");
+                }
                 self.pg_reload_conf()?;
             }
             self.post_apply_config()?;
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index b257c8a68f..e1bdfffa54 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -129,6 +129,13 @@ pub fn write_postgres_conf(
 
     writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
 
+    if spec.drop_subscriptions_before_start {
+        writeln!(file, "neon.disable_logical_replication_subscribers=true")?;
+    } else {
+        // be explicit about the default value
+        writeln!(file, "neon.disable_logical_replication_subscribers=false")?;
+    }
+
     // This is essential to keep this line at the end of the file,
     // because it is intended to override any settings above.
     writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
index 7401de2e60..5ee9c5fbd8 100644
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -47,7 +47,7 @@ pub enum PerDatabasePhase {
     DeleteDBRoleReferences,
     ChangeSchemaPerms,
     HandleAnonExtension,
-    DropSubscriptionsForDeletedDatabases,
+    DropLogicalSubscriptions,
 }
 
 #[derive(Clone, Debug)]
@@ -58,11 +58,13 @@ pub enum ApplySpecPhase {
     CreateAndAlterRoles,
     RenameAndDeleteDatabases,
     CreateAndAlterDatabases,
+    CreateSchemaNeon,
     RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
     HandleOtherExtensions,
     HandleNeonExtension,
     CreateAvailabilityCheck,
     DropRoles,
+    FinalizeDropLogicalSubscriptions,
 }
 
 pub struct Operation {
@@ -331,7 +333,7 @@ async fn get_operations<'a>(
                             // NB: there could be other db states, which prevent us from dropping
                             // the database. For example, if db is used by any active subscription
                             // or replication slot.
-                            // Such cases are handled in the DropSubscriptionsForDeletedDatabases
+                            // Such cases are handled in the DropLogicalSubscriptions
                             // phase. We do all the cleanup before actually dropping the database.
                             let drop_db_query: String = format!(
                                 "DROP DATABASE IF EXISTS {} WITH (FORCE)",
@@ -442,13 +444,19 @@ async fn get_operations<'a>(
 
             Ok(Box::new(operations))
         }
+        ApplySpecPhase::CreateSchemaNeon => Ok(Box::new(once(Operation {
+            query: String::from("CREATE SCHEMA IF NOT EXISTS neon"),
+            comment: Some(String::from(
+                "create schema for neon extension and utils tables",
+            )),
+        }))),
         ApplySpecPhase::RunInEachDatabase { db, subphase } => {
             match subphase {
-                PerDatabasePhase::DropSubscriptionsForDeletedDatabases => {
+                PerDatabasePhase::DropLogicalSubscriptions => {
                     match &db {
                         DB::UserDB(db) => {
                             let drop_subscription_query: String = format!(
-                                include_str!("sql/drop_subscription_for_drop_dbs.sql"),
+                                include_str!("sql/drop_subscriptions.sql"),
                                 datname_str = escape_literal(&db.name),
                             );
 
@@ -666,10 +674,6 @@ async fn get_operations<'a>(
         }
         ApplySpecPhase::HandleNeonExtension => {
             let operations = vec![
-                Operation {
-                    query: String::from("CREATE SCHEMA IF NOT EXISTS neon"),
-                    comment: Some(String::from("init: add schema for extension")),
-                },
                 Operation {
                     query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"),
                     comment: Some(String::from(
@@ -712,5 +716,9 @@ async fn get_operations<'a>(
 
             Ok(Box::new(operations))
         }
+        ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation {
+            query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")),
+            comment: None,
+        }))),
     }
 }
diff --git a/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql b/compute_tools/src/sql/drop_subscriptions.sql
similarity index 100%
rename from compute_tools/src/sql/drop_subscription_for_drop_dbs.sql
rename to compute_tools/src/sql/drop_subscriptions.sql
diff --git a/compute_tools/src/sql/finalize_drop_subscriptions.sql b/compute_tools/src/sql/finalize_drop_subscriptions.sql
new file mode 100644
index 0000000000..4bb291876f
--- /dev/null
+++ b/compute_tools/src/sql/finalize_drop_subscriptions.sql
@@ -0,0 +1,21 @@
+DO $$
+BEGIN
+    IF NOT EXISTS(
+        SELECT 1
+        FROM pg_catalog.pg_tables
+        WHERE tablename = 'drop_subscriptions_done'
+        AND schemaname = 'neon'
+    )
+    THEN
+        CREATE TABLE neon.drop_subscriptions_done
+        (id serial primary key, timeline_id text);
+    END IF;
+
+    -- preserve the timeline_id of the last drop_subscriptions run
+    -- to ensure that the cleanup of a timeline is executed only once.
+    -- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
+    INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
+    ON CONFLICT (id) DO UPDATE
+    SET timeline_id = current_setting('neon.timeline_id');
+END
+$$
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index c73debae4c..ba67ffa2dd 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1357,6 +1357,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                 args.pg_version,
                 mode,
                 !args.update_catalog,
+                false,
             )?;
         }
         EndpointCmd::Start(args) => {
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index b8027abf7c..bc86d09103 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -76,6 +76,7 @@ pub struct EndpointConf {
     http_port: u16,
     pg_version: u32,
     skip_pg_catalog_updates: bool,
+    drop_subscriptions_before_start: bool,
     features: Vec<ComputeFeature>,
 }
 
@@ -143,6 +144,7 @@ impl ComputeControlPlane {
         pg_version: u32,
         mode: ComputeMode,
         skip_pg_catalog_updates: bool,
+        drop_subscriptions_before_start: bool,
     ) -> Result<Arc<Endpoint>> {
         let pg_port = pg_port.unwrap_or_else(|| self.get_port());
         let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -162,6 +164,7 @@ impl ComputeControlPlane {
             // with this we basically test a case of waking up an idle compute, where
             // we also skip catalog updates in the cloud.
             skip_pg_catalog_updates,
+            drop_subscriptions_before_start,
             features: vec![],
         });
 
@@ -177,6 +180,7 @@ impl ComputeControlPlane {
                 pg_port,
                 pg_version,
                 skip_pg_catalog_updates,
+                drop_subscriptions_before_start,
                 features: vec![],
             })?,
         )?;
@@ -240,6 +244,7 @@ pub struct Endpoint {
     // Optimizations
     skip_pg_catalog_updates: bool,
 
+    drop_subscriptions_before_start: bool,
     // Feature flags
     features: Vec<ComputeFeature>,
 }
@@ -291,6 +296,7 @@ impl Endpoint {
             tenant_id: conf.tenant_id,
             pg_version: conf.pg_version,
             skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
+            drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
             features: conf.features,
         })
     }
@@ -625,6 +631,7 @@ impl Endpoint {
             shard_stripe_size: Some(shard_stripe_size),
             local_proxy_config: None,
             reconfigure_concurrency: 1,
+            drop_subscriptions_before_start: self.drop_subscriptions_before_start,
         };
         let spec_path = self.endpoint_path().join("spec.json");
         std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 54d6a1d38f..b3f18dc6da 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -138,6 +138,13 @@ pub struct ComputeSpec {
     /// enough spare connections for reconfiguration process to succeed.
     #[serde(default = "default_reconfigure_concurrency")]
     pub reconfigure_concurrency: usize,
+
+    /// If set to true, compute_ctl will drop all subscriptions before starting the
+    /// compute. This is needed when we start an endpoint on a branch, so that the child
+    /// does not compete with the parent branch's subscriptions
+    /// for the same replication content from the publisher.
+    #[serde(default)] // Default false
+    pub drop_subscriptions_before_start: bool,
 }
 
 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index ff08f9164d..ce2938cfd5 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -19,6 +19,7 @@
 #include "access/xlogrecovery.h"
 #endif
 #include "replication/logical.h"
+#include "replication/logicallauncher.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/proc.h"
@@ -434,6 +435,15 @@ _PG_init(void)
 
 	restore_running_xacts_callback = RestoreRunningXactsFromClog;
 
+	DefineCustomBoolVariable(
+							"neon.disable_logical_replication_subscribers",
+							"Disables incomming logical replication",
+							NULL,
+							&disable_logical_replication_subscribers,
+							false,
+							PGC_SIGHUP,
+							0,
+							NULL, NULL, NULL);
 
 	DefineCustomBoolVariable(
 							"neon.allow_replica_misconfig",
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index adbd6414a7..33d422c590 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -523,6 +523,7 @@ class NeonLocalCli(AbstractNeonCli):
         remote_ext_config: str | None = None,
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
+        create_test_user: bool = False,
         basebackup_request_tries: int | None = None,
         env: dict[str, str] | None = None,
     ) -> subprocess.CompletedProcess[str]:
@@ -544,6 +545,8 @@ class NeonLocalCli(AbstractNeonCli):
             args.extend(["--pageserver-id", str(pageserver_id)])
         if allow_multiple:
             args.extend(["--allow-multiple"])
+        if create_test_user:
+            args.extend(["--create-test-user"])
 
         res = self.raw_cli(args, extra_env_vars)
         res.check_returncode()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index af427b92d2..388c1eb046 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3918,6 +3918,7 @@ class Endpoint(PgProtocol, LogUtils):
         pageserver_id: int | None = None,
         safekeepers: list[int] | None = None,
         allow_multiple: bool = False,
+        create_test_user: bool = False,
         basebackup_request_tries: int | None = None,
         env: dict[str, str] | None = None,
     ) -> Self:
@@ -3939,6 +3940,7 @@ class Endpoint(PgProtocol, LogUtils):
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
+            create_test_user=create_test_user,
             basebackup_request_tries=basebackup_request_tries,
             env=env,
         )
diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py
new file mode 100644
index 0000000000..645572da8e
--- /dev/null
+++ b/test_runner/regress/test_subscriber_branching.py
@@ -0,0 +1,242 @@
+from __future__ import annotations
+
+import time
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
+from fixtures.utils import query_scalar, wait_until
+
+
+# This test checks that branching of timeline with logical subscriptions
+# does not affect logical replication for parent.
+# Endpoint on a new branch will drop all existing subscriptions at the start,
+# so it will not receive any changes.
+# If needed, user can create new subscriptions on the child branch.
+def test_subscriber_branching(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.create_branch("publisher")
+    pub = env.endpoints.create("publisher")
+    pub.respec(
+        skip_pg_catalog_updates=False,
+        create_test_user=True,
+    )
+    pub.start(create_test_user=True)
+
+    env.create_branch("subscriber")
+    sub = env.endpoints.create("subscriber")
+    # Pass create_test_user flag to get properly filled spec.users and spec.databases fields.
+    #
+    # This test checks the per-database operations that happen at compute start
+    # and these operations are applied to the databases that are present in the spec.
+    sub.respec(
+        skip_pg_catalog_updates=False,
+        create_test_user=True,
+    )
+    sub.start(create_test_user=True)
+
+    pub.wait_for_migrations()
+    sub.wait_for_migrations()
+
+    n_records = 1000
+
+    def check_that_changes_propagated():
+        scur.execute("SELECT count(*) FROM t")
+        res = scur.fetchall()
+        assert res[0][0] == n_records
+
+    def insert_data(pub, start):
+        with pub.cursor(dbname="neondb", user="test", password="pubtestpwd") as pcur:
+            for i in range(start, start + n_records):
+                pcur.execute("INSERT into t values (%s,random()*100000)", (i,))
+
+    # create_test_user creates a user without password
+    # but psycopg2 execute() requires a password
+    with sub.cursor() as scur:
+        scur.execute("ALTER USER test WITH PASSWORD 'testpwd'")
+    with pub.cursor() as pcur:
+        # Create a test user to avoid using superuser
+        pcur.execute("ALTER USER test WITH PASSWORD 'pubtestpwd'")
+        # If we don't do this, creating the subscription will fail
+        pub.edit_hba(["host all test 0.0.0.0/0 md5"])
+
+    with pub.cursor(dbname="neondb", user="test", password="pubtestpwd") as pcur:
+        pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
+        pcur.execute("CREATE PUBLICATION pub FOR TABLE t")
+
+        with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            scur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
+            pub_conn = (
+                f"host=localhost port={pub.pg_port} dbname=neondb user=test password=pubtestpwd"
+            )
+            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
+            scur.execute(query)
+            time.sleep(2)  # let initial table sync complete
+
+        insert_data(pub, 0)
+
+        with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            wait_until(check_that_changes_propagated)
+            latest_end_lsn = query_scalar(
+                scur, "select latest_end_lsn from pg_catalog.pg_stat_subscription; "
+            )
+            last_insert_lsn = query_scalar(scur, "select pg_current_wal_insert_lsn();")
+
+            log.info(f"latest_end_lsn = {latest_end_lsn}")
+            log.info(f"last_insert_lsn = {last_insert_lsn}")
+
+        # stop the parent subscriber so that it doesn't interfere with the test
+        sub.stop()
+
+        # 1. good scenario:
+        # create subscriber_child_1
+        # it will not get changes from publisher, because drop_subscriptions_before_start is set to True
+        sub_child_1_timeline_id = env.create_branch(
+            "subscriber_child_1",
+            ancestor_branch_name="subscriber",
+            ancestor_start_lsn=last_insert_lsn,
+        )
+        sub_child_1 = env.endpoints.create("subscriber_child_1")
+        # Pass drop_subscriptions_before_start flag
+        sub_child_1.respec(
+            skip_pg_catalog_updates=False,
+            create_test_user=True,
+            drop_subscriptions_before_start=True,
+        )
+        sub_child_1.start(create_test_user=True)
+
+        # ensure that subscriber_child_1 sees all the data
+        with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            scur.execute("SELECT count(*) FROM t")
+            res = scur.fetchall()
+            assert res[0][0] == n_records
+
+            # ensure that there are no subscriptions in this database
+            scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub'")
+            assert len(scur.fetchall()) == 0
+
+        # ensure that drop_subscriptions_done happened on this timeline
+        with sub_child_1.cursor() as scur_postgres:
+            scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done")
+            res = scur_postgres.fetchall()
+            assert len(res) == 1
+            assert str(sub_child_1_timeline_id) == res[0][0]
+
+        old_n_records = n_records
+        # insert more data on publisher
+        insert_data(pub, n_records)
+        n_records += n_records
+
+        pcur.execute("SELECT count(*) FROM t")
+        res = pcur.fetchall()
+        assert res[0][0] == n_records
+
+        # ensure that subscriber_child_1 doesn't see the new data
+        with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            scur.execute("SELECT count(*) FROM t")
+            res = scur.fetchall()
+            assert res[0][0] == old_n_records
+
+        # reenable logical replication on subscriber_child_1
+        # using new publication
+        # ensure that new publication works as expected
+        with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            scur.execute("TRUNCATE t")
+
+            # create new subscription
+            # with new pub name
+            pcur.execute("CREATE PUBLICATION pub_new FOR TABLE t")
+            query = f"CREATE SUBSCRIPTION sub_new CONNECTION '{pub_conn}' PUBLICATION pub_new"
+            scur.execute(query)
+
+            wait_until(check_that_changes_propagated)
+
+            scur.execute("SELECT count(*) FROM t")
+            res = scur.fetchall()
+            assert res[0][0] == n_records
+
+        # ensure that new publication works as expected after compute restart
+        # first restart with drop_subscriptions_before_start=True
+        # to emulate the case when compute restarts within the VM with stale spec
+        sub_child_1.stop()
+        sub_child_1.respec(
+            skip_pg_catalog_updates=False,
+            create_test_user=True,
+            drop_subscriptions_before_start=True,
+        )
+        sub_child_1.start(create_test_user=True)
+
+        with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            # ensure that even though the flag is set, we didn't drop new subscription
+            scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub_new'")
+            assert len(scur.fetchall()) == 1
+
+        # ensure that drop_subscriptions_done happened on this timeline
+        with sub_child_1.cursor() as scur_postgres:
+            scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done")
+            res = scur_postgres.fetchall()
+            assert len(res) == 1
+            assert str(sub_child_1_timeline_id) == res[0][0]
+
+        sub_child_1.stop()
+        sub_child_1.respec(
+            skip_pg_catalog_updates=False,
+            create_test_user=True,
+            drop_subscriptions_before_start=False,
+        )
+        sub_child_1.start(create_test_user=True)
+
+        # insert more data on publisher
+        insert_data(pub, n_records)
+        n_records += n_records
+        with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            # ensure that there is a subscription in this database
+            scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub_new'")
+            assert len(scur.fetchall()) == 1
+
+            wait_until(check_that_changes_propagated)
+            scur.execute("SELECT count(*) FROM t")
+            res = scur.fetchall()
+            assert res[0][0] == n_records
+
+        # ensure that drop_subscriptions_done happened on this timeline
+        with sub_child_1.cursor() as scur_postgres:
+            scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done")
+            res = scur_postgres.fetchall()
+            assert len(res) == 1
+            assert str(sub_child_1_timeline_id) == res[0][0]
+
+        # wake the sub and ensure that it catches up with the new data
+        sub.start(create_test_user=True)
+        with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            logical_replication_sync(sub, pub)
+            wait_until(check_that_changes_propagated)
+            scur.execute("SELECT count(*) FROM t")
+            res = scur.fetchall()
+            assert res[0][0] == n_records
+
+        # test that we can create a branch of a branch
+        sub_child_2_timeline_id = env.create_branch(
+            "subscriber_child_2",
+            ancestor_branch_name="subscriber_child_1",
+        )
+        sub_child_2 = env.endpoints.create("subscriber_child_2")
+        # Pass drop_subscriptions_before_start flag
+        sub_child_2.respec(
+            skip_pg_catalog_updates=False,
+            drop_subscriptions_before_start=True,
+        )
+        sub_child_2.start(create_test_user=True)
+
+        # ensure that subscriber_child_2 does not inherit subscription from child_1
+        with sub_child_2.cursor(dbname="neondb", user="test", password="testpwd") as scur:
+            # ensure that there are no subscriptions in this database
+            scur.execute("SELECT count(*) FROM pg_catalog.pg_subscription")
+            res = scur.fetchall()
+            assert res[0][0] == 0
+
+        # ensure that drop_subscriptions_done happened on this timeline
+        with sub_child_2.cursor() as scur_postgres:
+            scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done")
+            res = scur_postgres.fetchall()
+            assert len(res) == 1
+            assert str(sub_child_2_timeline_id) == res[0][0]
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 46082f2088..5f3b3afdd7 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 46082f20884f087a2d974b33ac65d63af26142bd
+Subproject commit 5f3b3afdd7c24b4a0fd63ecb3288fab472fcc633
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index dd0b28d6fb..935292e883 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit dd0b28d6fbad39e227f3b77296fcca879af8b3a9
+Subproject commit 935292e883298187f112db6e9c7f765037ddcf64
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index d674efd776..061d563779 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit d674efd776f59d78e8fa1535bd2f95c3e6984fca
+Subproject commit 061d56377961ba56998e41b7d5d5e975919ad301
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index a8dd6e779d..4276717f6e 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit a8dd6e779dde907778006adb436b557ad652fb97
+Subproject commit 4276717f6e91023e504de355f4f21d4824074de8
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c899dbaa5a..a104be8ae0 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.2",
-    "a8dd6e779dde907778006adb436b557ad652fb97"
+    "4276717f6e91023e504de355f4f21d4824074de8"
   ],
   "v16": [
     "16.6",
-    "d674efd776f59d78e8fa1535bd2f95c3e6984fca"
+    "061d56377961ba56998e41b7d5d5e975919ad301"
   ],
   "v15": [
     "15.10",
-    "dd0b28d6fbad39e227f3b77296fcca879af8b3a9"
+    "935292e883298187f112db6e9c7f765037ddcf64"
   ],
   "v14": [
     "14.15",
-    "46082f20884f087a2d974b33ac65d63af26142bd"
+    "5f3b3afdd7c24b4a0fd63ecb3288fab472fcc633"
   ]
 }

From 3702ec889faa37859e48a7d170fec59dc76b720b Mon Sep 17 00:00:00 2001
From: Mikhail Kot <mikhail@neon.tech>
Date: Thu, 23 Jan 2025 13:22:31 +0000
Subject: [PATCH 32/37] Enable postgres_fdw (#10426)

Update compute image to include postgres_fdw #3720
---
 compute/compute-node.Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 706c947008..a80c701b45 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -67,6 +67,9 @@ RUN cd postgres && \
     # Enable some of contrib extensions
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \
+    file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \
+    echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \

From b6c0f66619f0117480dd550b99d6cc57ef0192b7 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Thu, 23 Jan 2025 15:52:07 +0100
Subject: [PATCH 33/37] CI(autocomment): add the lfc state (#10121)

## Problem
Currently, the report does not contain the LFC state of the failed
tests.
## Summary of changes
Added the LFC state to the link to the allure report.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 scripts/comment-test-report.js                   | 11 +++++++++--
 scripts/ingest_regress_test_result-new-format.py |  2 +-
 test_runner/fixtures/parametrize.py              |  4 +++-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js
index e8e0b3c23a..96a0ea3267 100755
--- a/scripts/comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -84,6 +84,12 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => {
                 } else {
                     arch = "unknown"
                 }
+                let lfcState = ""
+                if (test.parameters.includes("'with-lfc'")) {
+                    lfcState = "with-lfc"
+                } else {
+                    lfcState = "without-lfc"
+                }
 
                 // Removing build type and PostgreSQL version from the test name to make it shorter
                 const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
@@ -91,6 +97,7 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => {
                 test.pgVersion = pgVersion
                 test.buildType = buildType
                 test.arch = arch
+                test.lfcState = lfcState
 
                 if (test.status === "passed") {
                     passedTests[pgVersion][testName].push(test)
@@ -157,7 +164,7 @@ const reportSummary = async (params) => {
                 const links = []
                 for (const test of tests) {
                     const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
-                    links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
+                    links.push(`[${test.buildType}-${test.arch}-${test.lfcState}](${allureLink})`)
                 }
                 summary += `- \`${testName}\`: ${links.join(", ")}\n`
             }
@@ -188,7 +195,7 @@ const reportSummary = async (params) => {
                     const links = []
                     for (const test of tests) {
                         const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
-                        links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
+                        links.push(`[${test.buildType}-${test.arch}-${test.lfcState}](${allureLink})`)
                     }
                     summary += `- \`${testName}\`: ${links.join(", ")}\n`
                 }
diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py
index 064c516718..ad2baf56bb 100644
--- a/scripts/ingest_regress_test_result-new-format.py
+++ b/scripts/ingest_regress_test_result-new-format.py
@@ -134,7 +134,7 @@ def ingest_test_result(
             if p["name"].startswith("__")
         }
         arch = parameters.get("arch", "UNKNOWN").strip("'")
-        lfc = parameters.get("lfc", "False") == "True"
+        lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc"
 
         build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
         labels = {label["name"]: label["value"] for label in test["labels"]}
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 3404c16f55..1acb1af23b 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -121,6 +121,8 @@ def pytest_runtest_makereport(*args, **kwargs):
     }.get(os.uname().machine, "UNKNOWN")
     arch = os.getenv("RUNNER_ARCH", uname_m)
     allure.dynamic.parameter("__arch", arch)
-    allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false")
+    allure.dynamic.parameter(
+        "__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc"
+    )
 
     yield

From ca6d72ba2a54aeb6041af26f4baeaf0e46d5c01b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 23 Jan 2025 17:43:04 +0100
Subject: [PATCH 34/37] Increase reconciler timeout after shard split (#10490)

Sometimes, especially when the host running the tests is overloaded, we
can run into reconcile timeouts in
`test_timeline_ancestor_detach_idempotent_success`, making the test
flaky. By increasing the timeouts from 30 seconds to 120 seconds, we can
address the flakiness.

Fixes #10464
---
 test_runner/regress/test_timeline_detach_ancestor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 5234d8278f..612a767480 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -607,7 +607,7 @@ def test_timeline_ancestor_detach_idempotent_success(
 
     if shards_after > 1:
         # FIXME: should this be in the neon_env_builder.init_start?
-        env.storage_controller.reconcile_until_idle()
+        env.storage_controller.reconcile_until_idle(timeout_secs=120)
         client = env.storage_controller.pageserver_api()
     else:
         client = env.pageserver.http_client()
@@ -636,7 +636,7 @@ def test_timeline_ancestor_detach_idempotent_success(
         # Do a shard split
         # This is a reproducer for https://github.com/neondatabase/neon/issues/9667
         env.storage_controller.tenant_shard_split(env.initial_tenant, shards_after)
-        env.storage_controller.reconcile_until_idle()
+        env.storage_controller.reconcile_until_idle(timeout_secs=120)
 
     first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch)
     assert set(first_reparenting_response) == {reparented1, reparented2}

From 616648258967ac59b5f8e3f83e0b5091e933ff86 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Thu, 23 Jan 2025 21:47:20 +0100
Subject: [PATCH 35/37] feat(compute): Automatically create release PRs
 (#10495)

We've finally transitioned to using a separate `release-compute` branch.
Now, we can finally automatically create release PRs on Fri and release
them during the following week.

Part of neondatabase/cloud#11698
---
 .github/workflows/release.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3c1af1d9c6..919846ce44 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -3,8 +3,9 @@ name: Create Release Branch
 on:
   schedule:
     # It should be kept in sync with if-condition in jobs
-    - cron: '0 6 * * FRI' # Storage release
     - cron: '0 6 * * THU' # Proxy release
+    - cron: '0 6 * * FRI' # Storage release
+    - cron: '0 7 * * FRI' # Compute release
   workflow_dispatch:
     inputs:
       create-storage-release-branch:
@@ -55,7 +56,7 @@ jobs:
       ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
 
   create-compute-release-branch:
-    if: inputs.create-compute-release-branch
+    if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }}
 
     permissions:
       contents: write

From 8d47a60de27ef8163fdc63cdc91cf3e807c8a14e Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 23 Jan 2025 16:54:44 -0500
Subject: [PATCH 36/37] fix(pageserver): handle dup layers during gc-compaction
 (#10430)

## Problem

If gc-compaction decides to rewrite an image layer, it currently causes
index_part to lose its reference to that layer. In detail:

* Assume there's only one image layer of key 0000...AAAA at LSN 0x100
and generation 0xA in the system.
* gc-compaction kicks in at gc-horizon 0x100, and then produces
0000...AAAA at LSN 0x100 and generation 0xB.
* It submits a compaction result update into the index part that unlinks
0000-AAAA-100-A and adds 0000-AAAA-100-B

On the remote storage / local disk side, this is fine -- it unlinks
things correctly and uploads the new file. However, the
`index_part.json` itself doesn't record generations. The buggy procedure
is as follows:

1. upload the new file
2. update the index part to remove the old file and add the new file
3. remove the new file

Therefore, the correct update result process for gc-compaction should be
as follows:

* When modifying the layer map, delete the old one and upload the new
one.
* When updating the index, add the new one to the index without
deleting the old one.

## Summary of changes

* Modify `finish_gc_compaction` to correctly order insertions and
deletions.
* Update the way gc-compaction uploads the layer files.
* Add new tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs  |  46 +++-
 .../src/tenant/timeline/layer_manager.rs      |  39 ++-
 test_runner/regress/test_compaction.py        | 239 +++++++++++++++++-
 3 files changed, 309 insertions(+), 15 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 4d5dc2d8a9..28c3381318 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -436,12 +436,14 @@ impl KeyHistoryRetention {
         if dry_run {
             return true;
         }
-        let guard = tline.layers.read().await;
-        if !guard.contains_key(key) {
-            return false;
+        let layer_generation;
+        {
+            let guard = tline.layers.read().await;
+            if !guard.contains_key(key) {
+                return false;
+            }
+            layer_generation = guard.get_from_key(key).metadata().generation;
         }
-        let layer_generation = guard.get_from_key(key).metadata().generation;
-        drop(guard);
         if layer_generation == tline.generation {
             info!(
                 key=%key,
@@ -2138,6 +2140,11 @@ impl Timeline {
             self.get_gc_compaction_watermark()
         };
 
+        if compact_below_lsn == Lsn::INVALID {
+            tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction");
+            return Ok(vec![]);
+        }
+
         // Split compaction job to about 4GB each
         const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024;
         let sub_compaction_max_job_size_mb =
@@ -2338,6 +2345,11 @@ impl Timeline {
                 // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use
                 // the real cutoff.
                 let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX {
+                    if real_gc_cutoff == Lsn::INVALID {
+                        // If the gc_cutoff is not generated yet, we should not compact anything.
+                        tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction");
+                        return Ok(());
+                    }
                     real_gc_cutoff
                 } else {
                     compact_lsn_range.end
@@ -2869,7 +2881,7 @@ impl Timeline {
             "produced {} delta layers and {} image layers, {} layers are kept",
             produced_delta_layers_len,
             produced_image_layers_len,
-            layer_selection.len()
+            keep_layers.len()
         );
 
         // Step 3: Place back to the layer map.
@@ -2915,8 +2927,28 @@ impl Timeline {
         // be batched into `schedule_compaction_update`.
         let disk_consistent_lsn = self.disk_consistent_lsn.load();
         self.schedule_uploads(disk_consistent_lsn, None)?;
+        // If a layer gets rewritten during gc-compaction, we need to keep that layer only in `compact_to` instead
+        // of `compact_from`.
+        let compact_from = {
+            let mut compact_from = Vec::new();
+            let mut compact_to_set = HashMap::new();
+            for layer in &compact_to {
+                compact_to_set.insert(layer.layer_desc().key(), layer);
+            }
+            for layer in &layer_selection {
+                if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) {
+                    tracing::info!(
+                        "skipping delete {} because found same layer key at different generation {}",
+                        layer, to
+                    );
+                } else {
+                    compact_from.push(layer.clone());
+                }
+            }
+            compact_from
+        };
         self.remote_client
-            .schedule_compaction_update(&layer_selection, &compact_to)?;
+            .schedule_compaction_update(&compact_from, &compact_to)?;
 
         drop(gc_lock);
 
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 3888e7f86a..f1cef7778c 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -337,16 +337,45 @@ impl OpenLayerManager {
         compact_to: &[ResidentLayer],
         metrics: &TimelineMetrics,
     ) {
-        // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
-        self.finish_compact_l0(compact_from, compact_to, metrics)
+        // gc-compaction could contain layer rewrites. We need to delete the old layers and insert the new ones.
+
+        // Match the old layers with the new layers
+        let mut add_layers = HashMap::new();
+        let mut rewrite_layers = HashMap::new();
+        let mut drop_layers = HashMap::new();
+        for layer in compact_from {
+            drop_layers.insert(layer.layer_desc().key(), layer.clone());
+        }
+        for layer in compact_to {
+            if let Some(old_layer) = drop_layers.remove(&layer.layer_desc().key()) {
+                rewrite_layers.insert(layer.layer_desc().key(), (old_layer.clone(), layer.clone()));
+            } else {
+                add_layers.insert(layer.layer_desc().key(), layer.clone());
+            }
+        }
+        let add_layers = add_layers.values().cloned().collect::<Vec<_>>();
+        let drop_layers = drop_layers.values().cloned().collect::<Vec<_>>();
+        let rewrite_layers = rewrite_layers.values().cloned().collect::<Vec<_>>();
+
+        self.rewrite_layers_inner(&rewrite_layers, &drop_layers, &add_layers, metrics);
     }
 
     /// Called post-compaction when some previous generation image layers were trimmed.
-    pub(crate) fn rewrite_layers(
+    pub fn rewrite_layers(
         &mut self,
         rewrite_layers: &[(Layer, ResidentLayer)],
         drop_layers: &[Layer],
         metrics: &TimelineMetrics,
+    ) {
+        self.rewrite_layers_inner(rewrite_layers, drop_layers, &[], metrics);
+    }
+
+    fn rewrite_layers_inner(
+        &mut self,
+        rewrite_layers: &[(Layer, ResidentLayer)],
+        drop_layers: &[Layer],
+        add_layers: &[ResidentLayer],
+        metrics: &TimelineMetrics,
     ) {
         let mut updates = self.layer_map.batch_update();
         for (old_layer, new_layer) in rewrite_layers {
@@ -382,6 +411,10 @@ impl OpenLayerManager {
         for l in drop_layers {
             Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
         }
+        for l in add_layers {
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
+        }
         updates.flush();
     }
 
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index fde26e1533..2edfc884ad 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import json
+import math
+import random
 import time
 from enum import StrEnum
 
@@ -128,11 +130,6 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
     }
 
     env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
-    env.pageserver.allowed_errors.append(
-        r".*failed to acquire partition lock during gc-compaction.*"
-    )
-    env.pageserver.allowed_errors.append(r".*repartition() called concurrently.*")
-
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
@@ -147,6 +144,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
     log.info("Writing initial data ...")
     workload.write_rows(row_count, env.pageserver.id)
 
+    ps_http.timeline_gc(
+        tenant_id, timeline_id, None
+    )  # Force refresh gc info to have gc_cutoff generated
+
     child_workloads: list[Workload] = []
 
     for i in range(1, churn_rounds + 1):
@@ -198,6 +199,230 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
     ps_http.timeline_gc(tenant_id, timeline_id, None)
 
 
+@pytest.mark.parametrize(
+    "compaction_mode",
+    ["before_restart", "after_restart"],
+)
+def test_pageserver_gc_compaction_idempotent(
+    neon_env_builder: NeonEnvBuilder, compaction_mode: str
+):
+    """
+    Run gc-compaction repeatedly without writing any new data and see if anything breaks.
+    We run this test in two modes:
+    - before_restart: run three gc-compactions back to back, without restarting the pageserver
+    - after_restart: run one gc-compaction, restart the pageserver, then run three more
+    """
+    SMOKE_CONF = {
+        # Run both gc and gc-compaction.
+        "gc_period": "5s",
+        "compaction_period": "5s",
+        # No PiTR interval and small GC horizon
+        "pitr_interval": "0s",
+        "gc_horizon": 1024,
+        "lsn_lease_length": "0s",
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Only in testing mode: the warning is expected because we rewrite a layer file of different generations.
+    # We could potentially patch the sanity-check code to not emit the warning in the future.
+    env.pageserver.allowed_errors.append(".*was unlinked but was not dangling.*")
+
+    row_count = 10000
+
+    ps_http = env.pageserver.http_client()
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageserver.id)
+
+    workload.write_rows(row_count, env.pageserver.id)
+
+    child_workloads: list[Workload] = []
+
+    def compaction_finished():
+        queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id))
+        assert queue_depth == 0
+
+    workload.churn_rows(row_count, env.pageserver.id)
+    env.create_branch("child_branch")  # so that we have a retain_lsn
+    workload.churn_rows(row_count, env.pageserver.id)
+    # compact 3 times if mode is before_restart, otherwise once (3 more follow after the restart)
+    n_compactions = 3 if compaction_mode == "before_restart" else 1
+    for _ in range(n_compactions):
+        # Force refresh gc info to have gc_cutoff generated
+        ps_http.timeline_gc(tenant_id, timeline_id, None)
+        ps_http.timeline_compact(
+            tenant_id,
+            timeline_id,
+            enhanced_gc_bottom_most_compaction=True,
+            body={
+                "scheduled": True,
+                "sub_compaction": True,
+                "compact_key_range": {
+                    "start": "000000000000000000000000000000000000",
+                    "end": "030000000000000000000000000000000000",
+                },
+                "sub_compaction_max_job_size_mb": 16,
+            },
+        )
+        wait_until(compaction_finished, timeout=60)
+    if compaction_mode == "after_restart":
+        env.pageserver.restart(True)
+        ps_http.timeline_gc(
+            tenant_id, timeline_id, None
+        )  # Force refresh gc info to have gc_cutoff generated
+        for _ in range(3):
+            ps_http.timeline_compact(
+                tenant_id,
+                timeline_id,
+                enhanced_gc_bottom_most_compaction=True,
+                body={
+                    "scheduled": True,
+                    "sub_compaction": True,
+                    "compact_key_range": {
+                        "start": "000000000000000000000000000000000000",
+                        "end": "030000000000000000000000000000000000",
+                    },
+                    "sub_compaction_max_job_size_mb": 16,
+                },
+            )
+            wait_until(compaction_finished, timeout=60)
+
+    # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
+    env.pageserver.assert_log_contains(
+        "scheduled_compact_timeline.*picked .* layers for compaction"
+    )
+
+    # ensure we hit the duplicated layer key warning at least once: we ran gc-compaction repeatedly without
+    # writing new data, so a repeated run should have hit it (in the same or in a different generation).
+    if compaction_mode == "before_restart":
+        env.pageserver.assert_log_contains("duplicated layer key in the same generation")
+    else:
+        env.pageserver.assert_log_contains("same layer key at different generation")
+
+    log.info("Validating at workload end ...")
+    workload.validate(env.pageserver.id)
+    for child_workload in child_workloads:
+        log.info(f"Validating at branch {child_workload.branch_name}")
+        child_workload.validate(env.pageserver.id)
+
+    # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
+    ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
+    ps_http.timeline_gc(tenant_id, timeline_id, None)
+
+
+@skip_in_debug_build("only run with release build")
+def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder):
+    """
+    Force interrupt a gc-compaction and see if anything breaks.
+    """
+    SMOKE_CONF = {
+        # Run both gc and gc-compaction.
+        "gc_period": "5s",
+        "compaction_period": "5s",
+        # No PiTR interval and small GC horizon
+        "pitr_interval": "0s",
+        "gc_horizon": "1024",
+        "lsn_lease_length": "0s",
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Only in testing mode: the warning is expected because we rewrite a layer file of different generations.
+    # We could potentially patch the sanity-check code to not emit the warning in the future.
+    env.pageserver.allowed_errors.append(".*was unlinked but was not dangling.*")
+
+    row_count = 10000
+    churn_rounds = 20
+
+    ps_http = env.pageserver.http_client()
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageserver.id)
+
+    log.info("Writing initial data ...")
+    workload.write_rows(row_count, env.pageserver.id)
+
+    def compaction_finished():
+        queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id))
+        assert queue_depth == 0
+
+    expected_compaction_time_seconds = 5.0
+    ps_http.timeline_gc(
+        tenant_id, timeline_id, None
+    )  # Force refresh gc info to have gc_cutoff generated
+    for i in range(1, churn_rounds + 1):
+        log.info(f"Running churn round {i}/{churn_rounds} ...")
+        workload.churn_rows(row_count, env.pageserver.id)
+        ps_http.timeline_compact(
+            tenant_id,
+            timeline_id,
+            enhanced_gc_bottom_most_compaction=True,
+            body={
+                "scheduled": True,
+                "sub_compaction": True,
+                "compact_key_range": {
+                    "start": "000000000000000000000000000000000000",
+                    "end": "030000000000000000000000000000000000",
+                },
+                "sub_compaction_max_job_size_mb": 16,
+            },
+        )
+        # sleep a random number of seconds in [0, max(5, expected compaction time)]; if the result is 0, or on the first round, wait until the compaction is complete
+        # This should hopefully trigger the restart at different stages of the compaction:
+        # - while we are doing the compaction
+        # - after we finished the compaction but before the metadata is uploaded
+        # - after we uploaded the metadata
+        time_to_sleep = random.randint(0, max(5, math.ceil(expected_compaction_time_seconds)))
+        if time_to_sleep == 0 or i == 1:
+            start = time.time()
+            wait_until(compaction_finished, timeout=60)
+            end = time.time()
+            expected_compaction_time_seconds = end - start
+            log.info(
+                f"expected_compaction_time_seconds updated to {expected_compaction_time_seconds} seconds"
+            )
+        else:
+            time.sleep(time_to_sleep)
+        env.pageserver.restart(True)
+        ps_http.timeline_gc(
+            tenant_id, timeline_id, None
+        )  # Force refresh gc info to have gc_cutoff generated
+        ps_http.timeline_compact(
+            tenant_id,
+            timeline_id,
+            enhanced_gc_bottom_most_compaction=True,
+            body={
+                "scheduled": True,
+                "sub_compaction": True,
+                "compact_key_range": {
+                    "start": "000000000000000000000000000000000000",
+                    "end": "030000000000000000000000000000000000",
+                },
+                "sub_compaction_max_job_size_mb": 16,
+            },
+        )
+        workload.validate(env.pageserver.id)
+
+    wait_until(compaction_finished, timeout=60)
+
+    # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
+    env.pageserver.assert_log_contains(
+        "scheduled_compact_timeline.*picked .* layers for compaction"
+    )
+
+    log.info("Validating at workload end ...")
+    workload.validate(env.pageserver.id)
+
+    # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
+    ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
+    ps_http.timeline_gc(tenant_id, timeline_id, None)
+
+
 # Stripe sizes in number of pages.
 TINY_STRIPES = 16
 LARGE_STRIPES = 32768
@@ -238,7 +463,9 @@ def test_sharding_compaction(
         "pitr_interval": "0s",
         # disable background compaction and GC. We invoke it manually when we want it to happen.
         "gc_period": "0s",
+        "gc_horizon": f"{128 * 1024}",
         "compaction_period": "0s",
+        "lsn_lease_length": "0s",
         # create image layers eagerly: we want to exercise image layer creation in this test.
         "image_creation_threshold": "1",
         "image_layer_creation_check_threshold": 0,
@@ -313,6 +540,8 @@ def test_sharding_compaction(
         for shard in env.storage_controller.locate(tenant_id):
             pageserver = env.get_pageserver(shard["node_id"])
             tenant_shard_id = shard["shard_id"]
+            # Force refresh gc info to have gc_cutoff generated
+            pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None)
             pageserver.http_client().timeline_compact(
                 tenant_shard_id,
                 timeline_id,

From f35e1356a1730ee86430d86824bc972d3bc44375 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 24 Jan 2025 06:02:13 +0000
Subject: [PATCH 37/37] Storage release 2025-01-24