From b76ab45cbe10e54678ed24094e81f0addcd788c2 Mon Sep 17 00:00:00 2001
From: dotdister <odsk.dr@gmail.com>
Date: Mon, 5 Aug 2024 16:23:59 +0900
Subject: [PATCH 01/48] safekeeper: remove unused partial_backup_enabled option
 (#8547)

## Problem
There is an unused safekeeper option `partial_backup_enabled`.

`partial_backup_enabled` was implemented in #6530, but this option was
always turned into enabled in #8022.

If you intended to keep this option for a specific reason, I will close
this PR.

## Summary of changes
I removed an unused safekeeper option `partial_backup_enabled`.
---
 safekeeper/src/bin/safekeeper.rs               | 6 ------
 safekeeper/src/lib.rs                          | 2 --
 safekeeper/src/timeline_manager.rs             | 4 ++--
 safekeeper/tests/walproposer_sim/safekeeper.rs | 1 -
 4 files changed, 2 insertions(+), 11 deletions(-)
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 2365fd0587..41c2d3fe08 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -170,11 +170,6 @@ struct Args {
     /// still needed for existing replication connection.
     #[arg(long)]
     walsenders_keep_horizon: bool,
-    /// Enable partial backup. If disabled, safekeeper will not upload partial
-    /// segments to remote storage.
-    /// TODO: now partial backup is always enabled, remove this flag.
-    #[arg(long)]
-    partial_backup_enabled: bool,
     /// Controls how long backup will wait until uploading the partial segment.
     #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
     partial_backup_timeout: Duration,
@@ -347,7 +342,6 @@ async fn main() -> anyhow::Result<()> {
         sk_auth_token,
         current_thread_runtime: args.current_thread_runtime,
         walsenders_keep_horizon: args.walsenders_keep_horizon,
-        partial_backup_enabled: true,
         partial_backup_timeout: args.partial_backup_timeout,
         disable_periodic_broker_push: args.disable_periodic_broker_push,
         enable_offload: args.enable_offload,
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 56d61e8287..2e11a279ca 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -93,7 +93,6 @@ pub struct SafeKeeperConf {
     pub sk_auth_token: Option<SecretString>,
     pub current_thread_runtime: bool,
     pub walsenders_keep_horizon: bool,
-    pub partial_backup_enabled: bool,
     pub partial_backup_timeout: Duration,
     pub disable_periodic_broker_push: bool,
     pub enable_offload: bool,
@@ -137,7 +136,6 @@ impl SafeKeeperConf {
             max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
             current_thread_runtime: false,
             walsenders_keep_horizon: false,
-            partial_backup_enabled: false,
             partial_backup_timeout: Duration::from_secs(0),
             disable_periodic_broker_push: false,
             enable_offload: false,
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index c224dcd398..482614fac7 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -544,8 +544,8 @@ impl Manager {
 
     /// Spawns partial WAL backup task if needed.
     async fn update_partial_backup(&mut self, state: &StateSnapshot) {
-        // check if partial backup is enabled and should be started
-        if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled {
+        // check if WAL backup is enabled and should be started
+        if !self.conf.is_wal_backup_enabled() {
             return;
         }
 
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 0c6d97ddfa..771d905c90 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -181,7 +181,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         sk_auth_token: None,
         current_thread_runtime: false,
         walsenders_keep_horizon: false,
-        partial_backup_enabled: false,
         partial_backup_timeout: Duration::from_secs(0),
         disable_periodic_broker_push: false,
         enable_offload: false,

From 5945eadd422b3151f142a766434da92825b1741f Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <iskyzh@gmail.com>
Date: Mon, 5 Aug 2024 18:30:49 +0800
Subject: [PATCH 02/48] feat(pageserver): support split delta layers (#8599)

part of https://github.com/neondatabase/neon/issues/8002

Similar to https://github.com/neondatabase/neon/pull/8574, we add
auto-split support for delta layers. Tests are reused from image layer
split writers.


---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 .../src/tenant/storage_layer/delta_layer.rs   |  18 ++
 .../src/tenant/storage_layer/split_writer.rs  | 243 ++++++++++++++++--
 2 files changed, 242 insertions(+), 19 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index f9becf53ff..e50fc2a266 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -384,6 +384,9 @@ struct DeltaLayerWriterInner {
     tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
 
     blob_writer: BlobWriter<true>,
+
+    // Number of key-lsns in the layer.
+    num_keys: usize,
 }
 
 impl DeltaLayerWriterInner {
@@ -425,6 +428,7 @@ impl DeltaLayerWriterInner {
             lsn_range,
             tree: tree_builder,
             blob_writer,
+            num_keys: 0,
         })
     }
 
@@ -475,6 +479,9 @@ impl DeltaLayerWriterInner {
 
         let delta_key = DeltaKey::from_key_lsn(&key, lsn);
         let res = self.tree.append(&delta_key.0, blob_ref.0);
+
+        self.num_keys += 1;
+
         (val, res.map_err(|e| anyhow::anyhow!(e)))
     }
 
@@ -686,6 +693,17 @@ impl DeltaLayerWriter {
             .finish(key_end, timeline, ctx)
             .await
     }
+
+    #[cfg(test)]
+    pub(crate) fn num_keys(&self) -> usize {
+        self.inner.as_ref().unwrap().num_keys
+    }
+
+    #[cfg(test)]
+    pub(crate) fn estimated_size(&self) -> u64 {
+        let inner = self.inner.as_ref().unwrap();
+        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
+    }
 }
 
 impl Drop for DeltaLayerWriter {
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index a4091a890c..a966775f9e 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -1,12 +1,12 @@
-use std::sync::Arc;
+use std::{ops::Range, sync::Arc};
 
 use bytes::Bytes;
 use pageserver_api::key::{Key, KEY_SIZE};
 use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
 
-use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
+use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
 
-use super::{ImageLayerWriter, ResidentLayer};
+use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
 
 /// An image writer that takes images and produces multiple image layers. The interface does not
 /// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
@@ -98,6 +98,107 @@ impl SplitImageLayerWriter {
         generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
         Ok(generated_layers)
     }
+
+    /// When split writer fails, the caller should call this function and handle partially generated layers.
+    #[allow(dead_code)]
+    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
+        Ok((self.generated_layers, self.inner))
+    }
+}
+
+/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
+/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
+/// to be cleaned up).
+#[must_use]
+pub struct SplitDeltaLayerWriter {
+    inner: DeltaLayerWriter,
+    target_layer_size: u64,
+    generated_layers: Vec<ResidentLayer>,
+    conf: &'static PageServerConf,
+    timeline_id: TimelineId,
+    tenant_shard_id: TenantShardId,
+    lsn_range: Range<Lsn>,
+}
+
+impl SplitDeltaLayerWriter {
+    pub async fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_shard_id: TenantShardId,
+        start_key: Key,
+        lsn_range: Range<Lsn>,
+        target_layer_size: u64,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Self> {
+        Ok(Self {
+            target_layer_size,
+            inner: DeltaLayerWriter::new(
+                conf,
+                timeline_id,
+                tenant_shard_id,
+                start_key,
+                lsn_range.clone(),
+                ctx,
+            )
+            .await?,
+            generated_layers: Vec::new(),
+            conf,
+            timeline_id,
+            tenant_shard_id,
+            lsn_range,
+        })
+    }
+
+    pub async fn put_value(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        val: Value,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
+        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
+        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
+        if self.inner.num_keys() >= 1
+            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
+        {
+            let next_delta_writer = DeltaLayerWriter::new(
+                self.conf,
+                self.timeline_id,
+                self.tenant_shard_id,
+                key,
+                self.lsn_range.clone(),
+                ctx,
+            )
+            .await?;
+            let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
+            self.generated_layers
+                .push(prev_delta_writer.finish(key, tline, ctx).await?);
+        }
+        self.inner.put_value(key, lsn, val, ctx).await
+    }
+
+    pub(crate) async fn finish(
+        self,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+        end_key: Key,
+    ) -> anyhow::Result<Vec<ResidentLayer>> {
+        let Self {
+            mut generated_layers,
+            inner,
+            ..
+        } = self;
+        generated_layers.push(inner.finish(end_key, tline, ctx).await?);
+        Ok(generated_layers)
+    }
+
+    /// When split writer fails, the caller should call this function and handle partially generated layers.
+    #[allow(dead_code)]
+    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
+        Ok((self.generated_layers, self.inner))
+    }
 }
 
 #[cfg(test)]
@@ -138,7 +239,7 @@ mod tests {
             .await
             .unwrap();
 
-        let mut writer = SplitImageLayerWriter::new(
+        let mut image_writer = SplitImageLayerWriter::new(
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
@@ -150,11 +251,42 @@ mod tests {
         .await
         .unwrap();
 
-        writer
+        let mut delta_writer = SplitDeltaLayerWriter::new(
+            tenant.conf,
+            tline.timeline_id,
+            tenant.tenant_shard_id,
+            get_key(0),
+            Lsn(0x18)..Lsn(0x20),
+            4 * 1024 * 1024,
+            &ctx,
+        )
+        .await
+        .unwrap();
+
+        image_writer
             .put_image(get_key(0), get_img(0), &tline, &ctx)
             .await
             .unwrap();
-        let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
+        let layers = image_writer
+            .finish(&tline, &ctx, get_key(10))
+            .await
+            .unwrap();
+        assert_eq!(layers.len(), 1);
+
+        delta_writer
+            .put_value(
+                get_key(0),
+                Lsn(0x18),
+                Value::Image(get_img(0)),
+                &tline,
+                &ctx,
+            )
+            .await
+            .unwrap();
+        let layers = delta_writer
+            .finish(&tline, &ctx, get_key(10))
+            .await
+            .unwrap();
         assert_eq!(layers.len(), 1);
     }
 
@@ -170,7 +302,7 @@ mod tests {
             .await
             .unwrap();
 
-        let mut writer = SplitImageLayerWriter::new(
+        let mut image_writer = SplitImageLayerWriter::new(
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
@@ -181,26 +313,58 @@ mod tests {
         )
         .await
         .unwrap();
+        let mut delta_writer = SplitDeltaLayerWriter::new(
+            tenant.conf,
+            tline.timeline_id,
+            tenant.tenant_shard_id,
+            get_key(0),
+            Lsn(0x18)..Lsn(0x20),
+            4 * 1024 * 1024,
+            &ctx,
+        )
+        .await
+        .unwrap();
         const N: usize = 2000;
         for i in 0..N {
             let i = i as u32;
-            writer
+            image_writer
                 .put_image(get_key(i), get_large_img(), &tline, &ctx)
                 .await
                 .unwrap();
+            delta_writer
+                .put_value(
+                    get_key(i),
+                    Lsn(0x20),
+                    Value::Image(get_large_img()),
+                    &tline,
+                    &ctx,
+                )
+                .await
+                .unwrap();
         }
-        let layers = writer
+        let image_layers = image_writer
             .finish(&tline, &ctx, get_key(N as u32))
             .await
             .unwrap();
-        assert_eq!(layers.len(), N / 512 + 1);
-        for idx in 0..layers.len() {
-            assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX);
+        let delta_layers = delta_writer
+            .finish(&tline, &ctx, get_key(N as u32))
+            .await
+            .unwrap();
+        assert_eq!(image_layers.len(), N / 512 + 1);
+        assert_eq!(delta_layers.len(), N / 512 + 1);
+        for idx in 0..image_layers.len() {
+            assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
+            assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
+            assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
+            assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
             if idx > 0 {
                 assert_eq!(
-                    layers[idx - 1].layer_desc().key_range.end,
-                    layers[idx].layer_desc().key_range.start
+                    image_layers[idx - 1].layer_desc().key_range.end,
+                    image_layers[idx].layer_desc().key_range.start
+                );
+                assert_eq!(
+                    delta_layers[idx - 1].layer_desc().key_range.end,
+                    delta_layers[idx].layer_desc().key_range.start
                 );
             }
         }
@@ -218,7 +382,7 @@ mod tests {
             .await
             .unwrap();
 
-        let mut writer = SplitImageLayerWriter::new(
+        let mut image_writer = SplitImageLayerWriter::new(
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
@@ -230,15 +394,56 @@ mod tests {
         .await
         .unwrap();
 
-        writer
+        let mut delta_writer = SplitDeltaLayerWriter::new(
+            tenant.conf,
+            tline.timeline_id,
+            tenant.tenant_shard_id,
+            get_key(0),
+            Lsn(0x18)..Lsn(0x20),
+            4 * 1024,
+            &ctx,
+        )
+        .await
+        .unwrap();
+
+        image_writer
             .put_image(get_key(0), get_img(0), &tline, &ctx)
             .await
             .unwrap();
-        writer
+        image_writer
             .put_image(get_key(1), get_large_img(), &tline, &ctx)
             .await
             .unwrap();
-        let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
+        let layers = image_writer
+            .finish(&tline, &ctx, get_key(10))
+            .await
+            .unwrap();
+        assert_eq!(layers.len(), 2);
+
+        delta_writer
+            .put_value(
+                get_key(0),
+                Lsn(0x18),
+                Value::Image(get_img(0)),
+                &tline,
+                &ctx,
+            )
+            .await
+            .unwrap();
+        delta_writer
+            .put_value(
+                get_key(1),
+                Lsn(0x1A),
+                Value::Image(get_large_img()),
+                &tline,
+                &ctx,
+            )
+            .await
+            .unwrap();
+        let layers = delta_writer
+            .finish(&tline, &ctx, get_key(10))
+            .await
+            .unwrap();
         assert_eq!(layers.len(), 2);
     }
 }

From afdbe0a7d0d688e1c0a49bea93030ba0fa3c962f Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 5 Aug 2024 14:24:54 +0300
Subject: [PATCH 03/48] Update Postgres versions to use smgrexists() instead of
 access() to check if Oid is used (#8597)

## Problem

PR #7992 was merged without correspondent changes in Postgres submodules
and this is why test_oid_overflow.py is failed now.

## Summary of changes

Bump Postgres versions

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index dbd0e6428b..7bbe834c8c 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit dbd0e6428b9274d72a10ac29bd3e3162faf109d4
+Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 035b73a9c5..9eba7dd382 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 035b73a9c5998f9a0ef35cc8df1bae680bf770fc
+Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index b39f316137..5377f5ed72 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit b39f316137fdd29e2da15d2af2fdd1cfd18163be
+Subproject commit 5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3
diff --git a/vendor/revisions.json b/vendor/revisions.json
index eeebd646f5..570dfc1550 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-  "v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"],
-  "v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"],
-  "v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"]
+  "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"],
+  "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"],
+  "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"]
 }

From 274c2c40b99fc04372fc9eea57e0ff29d44af3ec Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 5 Aug 2024 12:25:23 +0100
Subject: [PATCH 04/48] CI(trigger-e2e-tests): wait for promote-images job from
 the last commit (#8592)

## Problem

We don't trigger e2e tests for draft PRs, but we do trigger them once a
PR is in the "Ready for review" state.
Sometimes, a PR can be marked as "Ready for review" before we finish
image building. In such cases, triggering e2e tests fails.

## Summary of changes
- Make `trigger-e2e-tests` job poll status of `promote-images` job from
the build-and-test workflow for the last commit. And trigger only if the
status is `success`
- Remove explicit image checking from the workflow
- Add `concurrency` for `triggere-e2e-tests` workflow to make it
possible to cancel jobs in progress (if PR moves from "Draft" to "Ready
for review" several times in a row)
---
 .github/workflows/trigger-e2e-tests.yml | 42 ++++++++++++++++++-------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 77928a343e..0a615b3e37 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -10,11 +10,13 @@ defaults:
   run:
     shell: bash -euxo pipefail {0}
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
 env:
   # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
   E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
 jobs:
   cancel-previous-e2e-tests:
@@ -64,19 +66,35 @@ jobs:
     needs: [ tag ]
     runs-on: ubuntu-22.04
     env:
+      EVENT_ACTION: ${{ github.event.action }}
+      GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
       TAG: ${{ needs.tag.outputs.build-tag }}
     steps:
-      - name: check if ecr image are present
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+      - name: Wait for `promote-images` job to finish
+        # It's important to have a timeout here, the script in the step can run infinitely
+        timeout-minutes: 60
         run: |
-          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
-            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
-            if [ "$OUTPUT" == "" ]; then
-              echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
-              exit 1
-            fi
+          if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
+            exit 0
+          fi
+
+          # For PRs we use the run id as the tag
+          BUILD_AND_TEST_RUN_ID=${TAG}
+          while true; do
+            conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
+            case "$conclusion" in
+              success)
+                break
+                ;;
+              failure | cancelled | skipped)
+                echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
+                exit 1
+                ;;
+              *)
+                echo "The 'promote-images' hasn't succeed yet. Waiting..."
+                sleep 60
+                ;;
+            esac
           done
 
       - name: Set e2e-platforms

From 6564afb82256461bfc67077de91151bc94e5e5aa Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 5 Aug 2024 19:47:59 +0100
Subject: [PATCH 05/48] CI(trigger-e2e-tests): fix deadlock with Build and Test
 workflow (#8606)

## Problem

In some cases, a deadlock between `build-and-test` and
`trigger-e2e-tests` workflows can happen:

```
Build and Test

Canceling since a deadlock for concurrency group 'Build and Test-8600/merge-anysha' was detected between 'top level workflow' and 'trigger-e2e-tests'
```

I don't understand the reason completely, probably `${{ github.workflow
}}` got evaluated to the same value and somehow caused the issue.
We don't need to limit concurrency for `trigger-e2e-tests`
workflow.

See
https://neondb.slack.com/archives/C059ZC138NR/p1722869486708179?thread_ts=1722869027.960029&cid=C059ZC138NR
---
 .github/workflows/trigger-e2e-tests.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 0a615b3e37..6fbe785c56 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -10,10 +10,6 @@ defaults:
   run:
     shell: bash -euxo pipefail {0}
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
-  cancel-in-progress: true
-
 env:
   # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
   E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

From e1339ac9159d5bddcd57a5eb7204bf11f25e2a18 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 5 Aug 2024 23:21:33 +0300
Subject: [PATCH 06/48] fix: allow awaiting logical size for root timelines
 (#8604)

Currently if `GET
/v1/tenant/x/timeline/y?force-await-initial-logical-size=true` is
requested for a root timeline created within the current pageserver
session, the request handler panics hitting the debug assertion. These
timelines will always have an accurate (at initdb import) calculated
logical size. Fix is to never attempt prioritizing timeline size
calculation if we already have an exact value.

Split off from #8528.
---
 pageserver/src/tenant/timeline.rs              | 6 ++++++
 pageserver/src/tenant/timeline/logical_size.rs | 4 ++++
 test_runner/regress/test_timeline_size.py      | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index be72e15c19..8c80a54bdd 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4727,6 +4727,12 @@ impl Timeline {
             return;
         }
 
+        if self.current_logical_size.current_size().is_exact() {
+            // root timelines are initialized with exact count, but never start the background
+            // calculation
+            return;
+        }
+
         if let Some(await_bg_cancel) = self
             .current_logical_size
             .cancel_wait_for_background_loop_concurrency_limit_semaphore
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index b0d6c4a27a..f4a4eea54a 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -122,6 +122,10 @@ impl CurrentLogicalSize {
             Self::Exact(_) => Accuracy::Exact,
         }
     }
+
+    pub(crate) fn is_exact(&self) -> bool {
+        matches!(self, Self::Exact(_))
+    }
 }
 
 impl LogicalSize {
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 5e9a42f6b4..1f220eec9e 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -936,6 +936,9 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
+    # just make sure this doesn't hit an assertion
+    client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True)
+
     # load in some data
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     endpoint.safe_psql_many(

From 3b4b9c1d0b7d38ddef88cbbcd4765dddd30e01b1 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 5 Aug 2024 23:06:47 +0100
Subject: [PATCH 07/48] CI(benchmarking): set pub/sub projects for LR tests
 (#8483)

## Problem

> Currently, long-running LR tests recreate endpoints every night. We'd
like to have along-running buildup of history to exercise the pageserver
in this case (instead of "unit-testing" the same behavior everynight).

Closes #8317

## Summary of changes
- Update Postgres version for replication tests
- Set `BENCHMARK_PROJECT_ID_PUB`/`BENCHMARK_PROJECT_ID_SUB` env vars to
projects that were created for this purpose

---------

Co-authored-by: Sasha Krassovsky <krassovskysasha@gmail.com>
---
 .github/actionlint.yml                        |  2 ++
 .github/workflows/benchmarking.yml            |  9 ++++--
 test_runner/fixtures/neon_api.py              |  6 ++--
 .../performance/test_logical_replication.py   | 29 ++++++++++++++-----
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 37983798b7..d27fa01efa 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -8,6 +8,8 @@ self-hosted-runner:
     - small-arm64
     - us-east-2
 config-variables:
+  - BENCHMARK_PROJECT_ID_PUB
+  - BENCHMARK_PROJECT_ID_SUB
   - REMOTE_STORAGE_AZURE_CONTAINER
   - REMOTE_STORAGE_AZURE_REGION
   - SLACK_UPCOMING_RELEASE_CHANNEL_ID
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index f7ea534fb9..0f4dac841e 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,7 +147,7 @@ jobs:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: 16
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -168,7 +168,7 @@ jobs:
         path: /tmp/neon/
         prefix: latest
 
-    - name: Run benchmark
+    - name: Run Logical Replication benchmarks
       uses: ./.github/actions/run-python-test-set
       with:
         build_type: ${{ env.BUILD_TYPE }}
@@ -176,12 +176,15 @@ jobs:
         run_in_parallel: false
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 5400
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
         NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+        BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
+        BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
 
-    - name: Run benchmark
+    - name: Run Physical Replication benchmarks
       uses: ./.github/actions/run-python-test-set
       with:
         build_type: ${{ env.BUILD_TYPE }}
diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py
index 658ed119a1..0636cfad06 100644
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -285,9 +285,9 @@ class NeonApiEndpoint:
             self.project_id = project_id
             eps = neon_api.get_endpoints(project_id)["endpoints"]
             self.endpoint_id = eps[0]["id"]
-            self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[
-                "uri"
-            ]
+            self.connstr = neon_api.get_connection_uri(
+                project_id, endpoint_id=self.endpoint_id, pooled=False
+            )["uri"]
             pw = self.connstr.split("@")[0].split(":")[-1]
             self.pgbench_env = {
                 "PGHOST": eps[0]["host"],
diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index 53bb29a659..4b4ffc1fee 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -100,24 +100,32 @@ def test_subscriber_lag(
     pub_connstr = benchmark_project_pub.connstr
     sub_connstr = benchmark_project_sub.connstr
 
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+    if benchmark_project_pub.is_new:
+        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
+    if benchmark_project_sub.is_new:
+        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
 
     pub_conn = psycopg2.connect(pub_connstr)
     sub_conn = psycopg2.connect(sub_connstr)
     pub_conn.autocommit = True
     sub_conn.autocommit = True
     with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
-        if benchmark_project_pub.is_new:
-            pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
+        pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
+        pub_exists = len(pub_cur.fetchall()) != 0
 
-        if benchmark_project_sub.is_new:
+        if not pub_exists:
+            pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history")
+
+        sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'")
+        sub_exists = len(sub_cur.fetchall()) != 0
+        if not sub_exists:
             sub_cur.execute("truncate table pgbench_accounts")
             sub_cur.execute("truncate table pgbench_history")
 
-            sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
+            sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1")
 
         initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
+
     pub_conn.close()
     sub_conn.close()
 
@@ -195,10 +203,15 @@ def test_publisher_restart(
     pub_conn.autocommit = True
     sub_conn.autocommit = True
     with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
-        if benchmark_project_pub.is_new:
+        pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
+        pub_exists = len(pub_cur.fetchall()) != 0
+
+        if not pub_exists:
             pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
 
-        if benchmark_project_sub.is_new:
+        sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'")
+        sub_exists = len(sub_cur.fetchall()) != 0
+        if not sub_exists:
             sub_cur.execute("truncate table pgbench_accounts")
             sub_cur.execute("truncate table pgbench_history")
 

From 83afea3edbc52447025c741cd11fbf95a3c7bdd8 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <iskyzh@gmail.com>
Date: Tue, 6 Aug 2024 10:07:48 +0800
Subject: [PATCH 08/48] feat(pageserver): support dry-run for gc-compaction,
 add statistics (#8557)

Add dry-run mode that does not produce any image layer + delta layer. I
will use this code to do some experiments and see how much space we can
reclaim for tenants on staging. Part of
https://github.com/neondatabase/neon/issues/8002

* Add dry-run mode that runs the full compaction process without
updating the layer map. (We never call finish on the writers and the
files will be removed before exiting the function).
* Add compaction statistics and print them at the end of compaction.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant.rs                      |  56 +++++--
 .../src/tenant/storage_layer/image_layer.rs   |   8 +
 pageserver/src/tenant/timeline.rs             |   1 +
 pageserver/src/tenant/timeline/compaction.rs  | 151 +++++++++++++++++-
 4 files changed, 204 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 84c5095610..72d3aedd05 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -6899,7 +6899,10 @@ mod tests {
         }
 
         let cancel = CancellationToken::new();
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
 
         for (idx, expected) in expected_result.iter().enumerate() {
             assert_eq!(
@@ -6993,7 +6996,10 @@ mod tests {
             guard.cutoffs.time = Lsn(0x40);
             guard.cutoffs.space = Lsn(0x40);
         }
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
 
         Ok(())
     }
@@ -7327,7 +7333,10 @@ mod tests {
         }
 
         let cancel = CancellationToken::new();
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
 
         for idx in 0..10 {
             assert_eq!(
@@ -7353,7 +7362,10 @@ mod tests {
             guard.cutoffs.time = Lsn(0x40);
             guard.cutoffs.space = Lsn(0x40);
         }
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
 
         Ok(())
     }
@@ -7898,11 +7910,28 @@ mod tests {
         verify_result().await;
 
         let cancel = CancellationToken::new();
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        let mut dryrun_flags = EnumSet::new();
+        dryrun_flags.insert(CompactFlags::DryRun);
+
+        tline
+            .compact_with_gc(&cancel, dryrun_flags, &ctx)
+            .await
+            .unwrap();
+        // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
+        // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
+        verify_result().await;
+
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
         verify_result().await;
 
         // compact again
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
         verify_result().await;
 
         // increase GC horizon and compact again
@@ -7912,11 +7941,17 @@ mod tests {
             guard.cutoffs.time = Lsn(0x38);
             guard.cutoffs.space = Lsn(0x38);
         }
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
         verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
 
         // not increasing the GC horizon and compact again
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
         verify_result().await;
 
         Ok(())
@@ -8097,7 +8132,10 @@ mod tests {
         verify_result().await;
 
         let cancel = CancellationToken::new();
-        branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        branch_tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
 
         verify_result().await;
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index aa308ba3c1..f4f48aaf16 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -753,6 +753,10 @@ struct ImageLayerWriterInner {
 }
 
 impl ImageLayerWriterInner {
+    fn size(&self) -> u64 {
+        self.tree.borrow_writer().size() + self.blob_writer.size()
+    }
+
     ///
     /// Start building a new image layer.
     ///
@@ -1044,6 +1048,10 @@ impl ImageLayerWriter {
             .finish(timeline, ctx, Some(end_key))
             .await
     }
+
+    pub(crate) fn size(&self) -> u64 {
+        self.inner.as_ref().unwrap().size()
+    }
 }
 
 impl Drop for ImageLayerWriter {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 8c80a54bdd..5c268bf875 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -704,6 +704,7 @@ pub(crate) enum CompactFlags {
     ForceRepartition,
     ForceImageLayerCreation,
     EnhancedGcBottomMostCompaction,
+    DryRun,
 }
 
 impl std::fmt::Debug for Timeline {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 421f718ad6..1ff029a313 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -19,8 +19,10 @@ use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
+use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
+use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, info_span, trace, warn, Instrument};
 use utils::id::TimelineId;
@@ -41,6 +43,7 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 
 use crate::keyspace::KeySpace;
 use crate::repository::{Key, Value};
+use crate::walrecord::NeonWalRecord;
 
 use utils::lsn::Lsn;
 
@@ -73,6 +76,7 @@ impl KeyHistoryRetention {
         key: Key,
         delta_writer: &mut Vec<(Key, Lsn, Value)>,
         mut image_writer: Option<&mut ImageLayerWriter>,
+        stat: &mut CompactionStatistics,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let mut first_batch = true;
@@ -82,6 +86,7 @@ impl KeyHistoryRetention {
                     let Value::Image(img) = &logs[0].1 else {
                         unreachable!()
                     };
+                    stat.produce_image_key(img);
                     if let Some(image_writer) = image_writer.as_mut() {
                         image_writer.put_image(key, img.clone(), ctx).await?;
                     } else {
@@ -89,24 +94,111 @@ impl KeyHistoryRetention {
                     }
                 } else {
                     for (lsn, val) in logs {
+                        stat.produce_key(&val);
                         delta_writer.push((key, lsn, val));
                     }
                 }
                 first_batch = false;
             } else {
                 for (lsn, val) in logs {
+                    stat.produce_key(&val);
                     delta_writer.push((key, lsn, val));
                 }
             }
         }
         let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
         for (lsn, val) in above_horizon_logs {
+            stat.produce_key(&val);
             delta_writer.push((key, lsn, val));
         }
         Ok(())
     }
 }
 
+#[derive(Debug, Serialize, Default)]
+struct CompactionStatisticsNumSize {
+    num: u64,
+    size: u64,
+}
+
+#[derive(Debug, Serialize, Default)]
+pub struct CompactionStatistics {
+    delta_layer_visited: CompactionStatisticsNumSize,
+    image_layer_visited: CompactionStatisticsNumSize,
+    delta_layer_produced: CompactionStatisticsNumSize,
+    image_layer_produced: CompactionStatisticsNumSize,
+    num_delta_layer_discarded: usize,
+    num_image_layer_discarded: usize,
+    num_unique_keys_visited: usize,
+    wal_keys_visited: CompactionStatisticsNumSize,
+    image_keys_visited: CompactionStatisticsNumSize,
+    wal_produced: CompactionStatisticsNumSize,
+    image_produced: CompactionStatisticsNumSize,
+}
+
+impl CompactionStatistics {
+    fn estimated_size_of_value(val: &Value) -> usize {
+        match val {
+            Value::Image(img) => img.len(),
+            Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
+            _ => std::mem::size_of::<NeonWalRecord>(),
+        }
+    }
+    fn estimated_size_of_key() -> usize {
+        KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
+    }
+    fn visit_delta_layer(&mut self, size: u64) {
+        self.delta_layer_visited.num += 1;
+        self.delta_layer_visited.size += size;
+    }
+    fn visit_image_layer(&mut self, size: u64) {
+        self.image_layer_visited.num += 1;
+        self.image_layer_visited.size += size;
+    }
+    fn on_unique_key_visited(&mut self) {
+        self.num_unique_keys_visited += 1;
+    }
+    fn visit_wal_key(&mut self, val: &Value) {
+        self.wal_keys_visited.num += 1;
+        self.wal_keys_visited.size +=
+            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
+    }
+    fn visit_image_key(&mut self, val: &Value) {
+        self.image_keys_visited.num += 1;
+        self.image_keys_visited.size +=
+            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
+    }
+    fn produce_key(&mut self, val: &Value) {
+        match val {
+            Value::Image(img) => self.produce_image_key(img),
+            Value::WalRecord(_) => self.produce_wal_key(val),
+        }
+    }
+    fn produce_wal_key(&mut self, val: &Value) {
+        self.wal_produced.num += 1;
+        self.wal_produced.size +=
+            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
+    }
+    fn produce_image_key(&mut self, val: &Bytes) {
+        self.image_produced.num += 1;
+        self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
+    }
+    fn discard_delta_layer(&mut self) {
+        self.num_delta_layer_discarded += 1;
+    }
+    fn discard_image_layer(&mut self) {
+        self.num_image_layer_discarded += 1;
+    }
+    fn produce_delta_layer(&mut self, size: u64) {
+        self.delta_layer_produced.num += 1;
+        self.delta_layer_produced.size += size;
+    }
+    fn produce_image_layer(&mut self, size: u64) {
+        self.image_layer_produced.num += 1;
+        self.image_layer_produced.size += size;
+    }
+}
+
 impl Timeline {
     /// TODO: cancellation
     ///
@@ -118,12 +210,18 @@ impl Timeline {
         ctx: &RequestContext,
     ) -> Result<bool, CompactionError> {
         if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
-            self.compact_with_gc(cancel, ctx)
+            self.compact_with_gc(cancel, flags, ctx)
                 .await
                 .map_err(CompactionError::Other)?;
             return Ok(false);
         }
 
+        if flags.contains(CompactFlags::DryRun) {
+            return Err(CompactionError::Other(anyhow!(
+                "dry-run mode is not supported for legacy compaction for now"
+            )));
+        }
+
         // High level strategy for compaction / image creation:
         //
         // 1. First, calculate the desired "partitioning" of the
@@ -1641,6 +1739,7 @@ impl Timeline {
     pub(crate) async fn compact_with_gc(
         self: &Arc<Self>,
         cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         use std::collections::BTreeSet;
@@ -1664,12 +1763,16 @@ impl Timeline {
         )
         .await?;
 
-        info!("running enhanced gc bottom-most compaction");
+        let dry_run = flags.contains(CompactFlags::DryRun);
+
+        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
 
         scopeguard::defer! {
             info!("done enhanced gc bottom-most compaction");
         };
 
+        let mut stat = CompactionStatistics::default();
+
         // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
         // The layer selection has the following properties:
         // 1. If a layer is in the selection, all layers below it are in the selection.
@@ -1740,6 +1843,9 @@ impl Timeline {
                 let key_range = desc.get_key_range();
                 delta_split_points.insert(key_range.start);
                 delta_split_points.insert(key_range.end);
+                stat.visit_delta_layer(desc.file_size());
+            } else {
+                stat.visit_image_layer(desc.file_size());
             }
         }
         let mut delta_layers = Vec::new();
@@ -1775,6 +1881,8 @@ impl Timeline {
             tline: &Arc<Timeline>,
             lowest_retain_lsn: Lsn,
             ctx: &RequestContext,
+            stats: &mut CompactionStatistics,
+            dry_run: bool,
             last_batch: bool,
         ) -> anyhow::Result<Option<FlushDeltaResult>> {
             // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1831,6 +1939,7 @@ impl Timeline {
                     let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
                     drop(guard);
                     if layer_generation == tline.generation {
+                        stats.discard_delta_layer();
                         // TODO: depending on whether we design this compaction process to run along with
                         // other compactions, there could be layer map modifications after we drop the
                         // layer guard, and in case it creates duplicated layer key, we will still error
@@ -1857,6 +1966,10 @@ impl Timeline {
             for (key, lsn, val) in deltas {
                 delta_layer_writer.put_value(key, lsn, val, ctx).await?;
             }
+            stats.produce_delta_layer(delta_layer_writer.size());
+            if dry_run {
+                return Ok(None);
+            }
             let delta_layer = delta_layer_writer
                 .finish(delta_key.key_range.end, tline, ctx)
                 .await?;
@@ -1951,6 +2064,13 @@ impl Timeline {
         let mut current_delta_split_point = 0;
         let mut delta_layers = Vec::new();
         while let Some((key, lsn, val)) = merge_iter.next().await? {
+            if cancel.is_cancelled() {
+                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
+            }
+            match val {
+                Value::Image(_) => stat.visit_image_key(&val),
+                Value::WalRecord(_) => stat.visit_wal_key(&val),
+            }
             if last_key.is_none() || last_key.as_ref() == Some(&key) {
                 if last_key.is_none() {
                     last_key = Some(key);
@@ -1958,6 +2078,7 @@ impl Timeline {
                 accumulated_values.push((key, lsn, val));
             } else {
                 let last_key = last_key.as_mut().unwrap();
+                stat.on_unique_key_visited();
                 let retention = self
                     .generate_key_retention(
                         *last_key,
@@ -1974,6 +2095,7 @@ impl Timeline {
                         *last_key,
                         &mut delta_values,
                         image_layer_writer.as_mut(),
+                        &mut stat,
                         ctx,
                     )
                     .await?;
@@ -1986,6 +2108,8 @@ impl Timeline {
                         self,
                         lowest_retain_lsn,
                         ctx,
+                        &mut stat,
+                        dry_run,
                         false,
                     )
                     .await?,
@@ -1998,6 +2122,7 @@ impl Timeline {
 
         let last_key = last_key.expect("no keys produced during compaction");
         // TODO: move this part to the loop body
+        stat.on_unique_key_visited();
         let retention = self
             .generate_key_retention(
                 last_key,
@@ -2014,6 +2139,7 @@ impl Timeline {
                 last_key,
                 &mut delta_values,
                 image_layer_writer.as_mut(),
+                &mut stat,
                 ctx,
             )
             .await?;
@@ -2026,6 +2152,8 @@ impl Timeline {
                 self,
                 lowest_retain_lsn,
                 ctx,
+                &mut stat,
+                dry_run,
                 true,
             )
             .await?,
@@ -2033,12 +2161,28 @@ impl Timeline {
         assert!(delta_values.is_empty(), "unprocessed keys");
 
         let image_layer = if discard_image_layer {
+            stat.discard_image_layer();
             None
         } else if let Some(writer) = image_layer_writer {
-            Some(writer.finish(self, ctx).await?)
+            stat.produce_image_layer(writer.size());
+            if !dry_run {
+                Some(writer.finish(self, ctx).await?)
+            } else {
+                None
+            }
         } else {
             None
         };
+
+        info!(
+            "gc-compaction statistics: {}",
+            serde_json::to_string(&stat)?
+        );
+
+        if dry_run {
+            return Ok(());
+        }
+
         info!(
             "produced {} delta layers and {} image layers",
             delta_layers.len(),
@@ -2062,6 +2206,7 @@ impl Timeline {
         let mut layer_selection = layer_selection;
         layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
         compact_to.extend(image_layer);
+
         // Step 3: Place back to the layer map.
         {
             let mut guard = self.layers.write().await;

From c89ee814e1cbd2d6111d30afd4f13a05fe501ca2 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 6 Aug 2024 10:52:01 +0300
Subject: [PATCH 09/48] fix: make Timeline::set_disk_consistent_lsn use
 fetch_max (#8311)

now it is safe to use from multiple callers, as we have two callers.
---
 pageserver/src/tenant/timeline.rs | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5c268bf875..05bf4eac8b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4123,17 +4123,11 @@ impl Timeline {
 
     /// Return true if the value changed
     ///
-    /// This function must only be used from the layer flush task, and may not be called concurrently.
+    /// This function must only be used from the layer flush task.
     fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
-        // We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
-        let old_value = self.disk_consistent_lsn.load();
-        if new_value != old_value {
-            assert!(new_value >= old_value);
-            self.disk_consistent_lsn.store(new_value);
-            true
-        } else {
-            false
-        }
+        let old_value = self.disk_consistent_lsn.fetch_max(new_value);
+        assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
+        new_value != old_value
     }
 
     /// Update metadata file

From b2bc5795be03fae0fa579d06430865ee83bbbfb1 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 6 Aug 2024 12:09:56 +0300
Subject: [PATCH 10/48] feat: persistent gc blocking (#8600)

Currently, we do not have facilities to persistently block GC on a
tenant for whatever reason. We could do a tenant configuration update,
but that is risky for generation numbers and would also be transient.
Introduce a `gc_block` facility in the tenant, which manages per
timeline blocking reasons.

Additionally, add HTTP endpoints for enabling/disabling manual gc
blocking for a specific timeline. For debugging, individual tenant
status now includes a similar string representation logged when GC is
skipped.

Cc: #6994
---
 libs/pageserver_api/src/models.rs             |   9 +
 pageserver/src/http/openapi_spec.yml          |  39 ++++
 pageserver/src/http/routes.rs                 |  76 +++++++
 pageserver/src/tenant.rs                      |  30 +++
 pageserver/src/tenant/gc_block.rs             | 213 ++++++++++++++++++
 .../src/tenant/remote_timeline_client.rs      | 117 ++++++++++
 .../tenant/remote_timeline_client/index.rs    | 133 +++++++++++
 pageserver/src/tenant/timeline.rs             |  16 ++
 pageserver/src/tenant/timeline/delete.rs      |   2 +
 test_runner/fixtures/pageserver/http.py       |  16 ++
 .../regress/test_timeline_gc_blocking.py      |  67 ++++++
 11 files changed, 718 insertions(+)
 create mode 100644 pageserver/src/tenant/gc_block.rs
 create mode 100644 test_runner/regress/test_timeline_gc_blocking.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 591c45d908..b541bba6a1 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -637,6 +637,13 @@ pub struct TenantInfo {
     pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
     pub attachment_status: TenantAttachmentStatus,
     pub generation: u32,
+
+    /// Opaque explanation if gc is being blocked.
+    ///
+    /// Only looked up for the individual tenant detail, not the listing. This is purely for
+    /// debugging, not included in openapi.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_blocking: Option<String>,
 }
 
 #[derive(Serialize, Deserialize, Clone)]
@@ -1427,6 +1434,7 @@ mod tests {
             current_physical_size: Some(42),
             attachment_status: TenantAttachmentStatus::Attached,
             generation: 1,
+            gc_blocking: None,
         };
         let expected_active = json!({
             "id": original_active.id.to_string(),
@@ -1449,6 +1457,7 @@ mod tests {
             current_physical_size: Some(42),
             attachment_status: TenantAttachmentStatus::Attached,
             generation: 1,
+            gc_blocking: None,
         };
         let expected_broken = json!({
             "id": original_broken.id.to_string(),
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 59e646d0ca..4656f2c93a 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -308,6 +308,45 @@ paths:
             application/json:
               schema:
                 type: string
+
+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    post:
+      description: Persistently add a gc blocking at the tenant level because of this timeline
+      responses:
+        "200":
+          description: OK
+
+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    post:
+      description: Persistently remove a tenant level gc blocking for this timeline
+      responses:
+        "200":
+          description: OK
+
   /v1/tenant/{tenant_shard_id}/location_config:
     parameters:
       - name: tenant_shard_id
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 117f2c5869..fdab780bfb 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -935,6 +935,7 @@ async fn tenant_list_handler(
             generation: (*gen)
                 .into()
                 .expect("Tenants are always attached with a generation"),
+            gc_blocking: None,
         })
         .collect::<Vec<TenantInfo>>();
 
@@ -986,6 +987,7 @@ async fn tenant_status(
                     .generation()
                     .into()
                     .expect("Tenants are always attached with a generation"),
+                gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
             },
             walredo: tenant.wal_redo_manager_status(),
             timelines: tenant.list_timeline_ids(),
@@ -1226,6 +1228,72 @@ async fn evict_timeline_layer_handler(
     }
 }
 
+async fn timeline_gc_blocking_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    block_or_unblock_gc(request, true).await
+}
+
+async fn timeline_gc_unblocking_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    block_or_unblock_gc(request, false).await
+}
+
+/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
+///
+/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
+async fn block_or_unblock_gc(
+    request: Request<Body>,
+    block: bool,
+) -> Result<Response<Body>, ApiError> {
+    use crate::tenant::{
+        remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
+    };
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let state = get_state(&request);
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+    let timeline = tenant.get_timeline(timeline_id, true)?;
+
+    let fut = async {
+        if block {
+            timeline.block_gc(&tenant).await.map(|_| ())
+        } else {
+            timeline.unblock_gc(&tenant).await
+        }
+    };
+
+    let span = tracing::info_span!(
+        "block_or_unblock_gc",
+        tenant_id = %tenant_shard_id.tenant_id,
+        shard_id = %tenant_shard_id.shard_slug(),
+        timeline_id = %timeline_id,
+        block = block,
+    );
+
+    let res = fut.instrument(span).await;
+
+    res.map_err(|e| {
+        if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
+            ApiError::ShuttingDown
+        } else {
+            ApiError::InternalServerError(e)
+        }
+    })?;
+
+    json_response(StatusCode::OK, ())
+}
+
 /// Get tenant_size SVG graph along with the JSON data.
 fn synthetic_size_html_response(
     inputs: ModelInputs,
@@ -2904,6 +2972,14 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
             |r| api_handler(r, evict_timeline_layer_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
+            |r| api_handler(r, timeline_gc_blocking_handler),
+        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
+            |r| api_handler(r, timeline_gc_unblocking_handler),
+        )
         .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
             api_handler(r, secondary_upload_handler)
         })
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 72d3aedd05..de9b55d847 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -148,6 +148,7 @@ pub(crate) mod timeline;
 
 pub mod size;
 
+mod gc_block;
 pub(crate) mod throttle;
 
 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -303,6 +304,12 @@ pub struct Tenant {
     /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
     ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
 
+    /// `index_part.json` based gc blocking reason tracking.
+    ///
+    /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
+    /// proceeding.
+    pub(crate) gc_block: gc_block::GcBlock,
+
     l0_flush_global_state: L0FlushGlobalState,
 }
 
@@ -1036,6 +1043,8 @@ impl Tenant {
             }
         }
 
+        let mut gc_blocks = HashMap::new();
+
         // For every timeline, download the metadata file, scan the local directory,
         // and build a layer map that contains an entry for each remote and local
         // layer file.
@@ -1045,6 +1054,16 @@ impl Tenant {
                 .remove(&timeline_id)
                 .expect("just put it in above");
 
+            if let Some(blocking) = index_part.gc_blocking.as_ref() {
+                // could just filter these away, but it helps while testing
+                anyhow::ensure!(
+                    !blocking.reasons.is_empty(),
+                    "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
+                );
+                let prev = gc_blocks.insert(timeline_id, blocking.reasons);
+                assert!(prev.is_none());
+            }
+
             // TODO again handle early failure
             self.load_remote_timeline(
                 timeline_id,
@@ -1089,6 +1108,8 @@ impl Tenant {
         // IndexPart is the source of truth.
         self.clean_up_timelines(&existent_timelines)?;
 
+        self.gc_block.set_scanned(gc_blocks);
+
         fail::fail_point!("attach-before-activate", |_| {
             anyhow::bail!("attach-before-activate");
         });
@@ -1679,6 +1700,14 @@ impl Tenant {
             }
         }
 
+        let _guard = match self.gc_block.start().await {
+            Ok(guard) => guard,
+            Err(reasons) => {
+                info!("Skipping GC: {reasons}");
+                return Ok(GcResult::default());
+            }
+        };
+
         self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
             .await
     }
@@ -2691,6 +2720,7 @@ impl Tenant {
             )),
             tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
             ongoing_timeline_detach: std::sync::Mutex::default(),
+            gc_block: Default::default(),
             l0_flush_global_state,
         }
     }
diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs
new file mode 100644
index 0000000000..8b41ba1746
--- /dev/null
+++ b/pageserver/src/tenant/gc_block.rs
@@ -0,0 +1,213 @@
+use std::collections::HashMap;
+
+use utils::id::TimelineId;
+
+use super::remote_timeline_client::index::GcBlockingReason;
+
+type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
+
+#[derive(Default)]
+pub(crate) struct GcBlock {
+    /// The timelines which have current reasons to block gc.
+    ///
+    /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
+    /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
+    reasons: std::sync::Mutex<Storage>,
+    blocking: tokio::sync::Mutex<()>,
+}
+
+impl GcBlock {
+    /// Start another gc iteration.
+    ///
+    /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
+    /// it's ending, or if not currently possible, a value describing the reasons why not.
+    ///
+    /// Cancellation safe.
+    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
+        let reasons = {
+            let g = self.reasons.lock().unwrap();
+
+            // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
+            // tests, we use everything. we should warn if the gc has been consecutively blocked
+            // for more than 1h (within single tenant session?).
+            BlockingReasons::clean_and_summarize(g)
+        };
+
+        if let Some(reasons) = reasons {
+            Err(reasons)
+        } else {
+            Ok(Guard {
+                _inner: self.blocking.lock().await,
+            })
+        }
+    }
+
+    pub(crate) fn summary(&self) -> Option<BlockingReasons> {
+        let g = self.reasons.lock().unwrap();
+
+        BlockingReasons::summarize(&g)
+    }
+
+    /// Start blocking gc for this one timeline for the given reason.
+    ///
+    /// This is not a guard based API but instead it mimics set API. The returned future will not
+    /// resolve until an existing gc round has completed.
+    ///
+    /// Returns true if this block was new, false if gc was already blocked for this reason.
+    ///
+    /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will
+    /// keep the gc blocking reason.
+    pub(crate) async fn insert(
+        &self,
+        timeline: &super::Timeline,
+        reason: GcBlockingReason,
+    ) -> anyhow::Result<bool> {
+        let (added, uploaded) = {
+            let mut g = self.reasons.lock().unwrap();
+            let set = g.entry(timeline.timeline_id).or_default();
+            let added = set.insert(reason);
+
+            // LOCK ORDER: intentionally hold the lock, see self.reasons.
+            let uploaded = timeline
+                .remote_client
+                .schedule_insert_gc_block_reason(reason)?;
+
+            (added, uploaded)
+        };
+
+        uploaded.await?;
+
+        // ensure that any ongoing gc iteration has completed
+        drop(self.blocking.lock().await);
+
+        Ok(added)
+    }
+
+    /// Remove blocking gc for this one timeline and the given reason.
+    pub(crate) async fn remove(
+        &self,
+        timeline: &super::Timeline,
+        reason: GcBlockingReason,
+    ) -> anyhow::Result<()> {
+        use std::collections::hash_map::Entry;
+
+        super::span::debug_assert_current_span_has_tenant_and_timeline_id();
+
+        let (remaining_blocks, uploaded) = {
+            let mut g = self.reasons.lock().unwrap();
+            match g.entry(timeline.timeline_id) {
+                Entry::Occupied(mut oe) => {
+                    let set = oe.get_mut();
+                    set.remove(reason);
+                    if set.is_empty() {
+                        oe.remove();
+                    }
+                }
+                Entry::Vacant(_) => {
+                    // we must still do the index_part.json update regardless, in case we had earlier
+                    // been cancelled
+                }
+            }
+
+            let remaining_blocks = g.len();
+
+            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
+            let uploaded = timeline
+                .remote_client
+                .schedule_remove_gc_block_reason(reason)?;
+
+            (remaining_blocks, uploaded)
+        };
+        uploaded.await?;
+
+        // no need to synchronize with gc iteration again
+
+        if remaining_blocks > 0 {
+            tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
+        } else {
+            tracing::info!("gc is now unblocked for the tenant");
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
+        let unblocked = {
+            let mut g = self.reasons.lock().unwrap();
+            if g.is_empty() {
+                return;
+            }
+
+            g.remove(&timeline.timeline_id);
+
+            BlockingReasons::clean_and_summarize(g).is_none()
+        };
+
+        if unblocked {
+            tracing::info!("gc is now unblocked following deletion");
+        }
+    }
+
+    /// Initialize with the non-deleted timelines of this tenant.
+    pub(crate) fn set_scanned(&self, scanned: Storage) {
+        let mut g = self.reasons.lock().unwrap();
+        assert!(g.is_empty());
+        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
+
+        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
+            tracing::info!(summary=?reasons, "initialized with gc blocked");
+        }
+    }
+}
+
+pub(super) struct Guard<'a> {
+    _inner: tokio::sync::MutexGuard<'a, ()>,
+}
+
+#[derive(Debug)]
+pub(crate) struct BlockingReasons {
+    timelines: usize,
+    reasons: enumset::EnumSet<GcBlockingReason>,
+}
+
+impl std::fmt::Display for BlockingReasons {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{} timelines block for {:?}",
+            self.timelines, self.reasons
+        )
+    }
+}
+
+impl BlockingReasons {
+    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
+        let mut reasons = enumset::EnumSet::empty();
+        g.retain(|_key, value| {
+            reasons = reasons.union(*value);
+            !value.is_empty()
+        });
+        if !g.is_empty() {
+            Some(BlockingReasons {
+                timelines: g.len(),
+                reasons,
+            })
+        } else {
+            None
+        }
+    }
+
+    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
+        if g.is_empty() {
+            None
+        } else {
+            let reasons = g
+                .values()
+                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
+            Some(BlockingReasons {
+                timelines: g.len(),
+                reasons,
+            })
+        }
+    }
+}
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 9e021c7e35..1344fe4192 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -800,6 +800,123 @@ impl RemoteTimelineClient {
             .context("wait completion")
     }
 
+    /// Adds a gc blocking reason for this timeline if one does not exist already.
+    ///
+    /// A retryable step of timeline detach ancestor.
+    ///
+    /// Returns a future which waits until the completion of the upload.
+    pub(crate) fn schedule_insert_gc_block_reason(
+        self: &Arc<Self>,
+        reason: index::GcBlockingReason,
+    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
+    {
+        let maybe_barrier = {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = guard.initialized_mut()?;
+
+            if let index::GcBlockingReason::DetachAncestor = reason {
+                if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
+                    drop(guard);
+                    panic!("cannot start detach ancestor if there is nothing to detach from");
+                }
+            }
+
+            let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
+
+            let current = upload_queue.dirty.gc_blocking.as_ref();
+            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
+
+            match (current, uploaded) {
+                (x, y) if wanted(x) && wanted(y) => None,
+                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
+                // Usual case: !wanted(x) && !wanted(y)
+                //
+                // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
+                // turn on and off some reason.
+                (x, y) => {
+                    if !wanted(x) && wanted(y) {
+                        // this could be avoided by having external in-memory synchronization, like
+                        // timeline detach ancestor
+                        warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
+                    }
+
+                    // at this point, the metadata must always show that there is a parent
+                    upload_queue.dirty.gc_blocking = current
+                        .map(|x| x.with_reason(reason))
+                        .or_else(|| Some(index::GcBlocking::started_now_for(reason)));
+                    self.schedule_index_upload(upload_queue)?;
+                    Some(self.schedule_barrier0(upload_queue))
+                }
+            }
+        };
+
+        Ok(async move {
+            if let Some(barrier) = maybe_barrier {
+                Self::wait_completion0(barrier).await?;
+            }
+            Ok(())
+        })
+    }
+
+    /// Removes a gc blocking reason for this timeline if one exists.
+    ///
+    /// A retryable step of timeline detach ancestor.
+    ///
+    /// Returns a future which waits until the completion of the upload.
+    pub(crate) fn schedule_remove_gc_block_reason(
+        self: &Arc<Self>,
+        reason: index::GcBlockingReason,
+    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
+    {
+        let maybe_barrier = {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = guard.initialized_mut()?;
+
+            if let index::GcBlockingReason::DetachAncestor = reason {
+                if !upload_queue
+                    .clean
+                    .0
+                    .lineage
+                    .is_detached_from_original_ancestor()
+                {
+                    drop(guard);
+                    panic!("cannot complete timeline_ancestor_detach while not detached");
+                }
+            }
+
+            let wanted = |x: Option<&index::GcBlocking>| {
+                x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
+            };
+
+            let current = upload_queue.dirty.gc_blocking.as_ref();
+            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
+
+            match (current, uploaded) {
+                (x, y) if wanted(x) && wanted(y) => None,
+                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
+                (x, y) => {
+                    if !wanted(x) && wanted(y) {
+                        warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
+                    }
+
+                    upload_queue.dirty.gc_blocking =
+                        current.as_ref().and_then(|x| x.without_reason(reason));
+                    assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
+                    // FIXME: bogus ?
+                    self.schedule_index_upload(upload_queue)?;
+                    Some(self.schedule_barrier0(upload_queue))
+                }
+            }
+        };
+
+        Ok(async move {
+            if let Some(barrier) = maybe_barrier {
+                Self::wait_completion0(barrier).await?;
+            }
+            Ok(())
+        })
+    }
+
     /// Launch an upload operation in the background; the file is added to be included in next
     /// `index_part.json` upload.
     pub(crate) fn schedule_layer_file_upload(
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 3075df022e..8e6290030d 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -60,6 +60,9 @@ pub struct IndexPart {
     #[serde(default)]
     pub(crate) lineage: Lineage,
 
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub(crate) gc_blocking: Option<GcBlocking>,
+
     /// Describes the kind of aux files stored in the timeline.
     ///
     /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -101,6 +104,7 @@ impl IndexPart {
             deleted_at: None,
             archived_at: None,
             lineage: Default::default(),
+            gc_blocking: None,
             last_aux_file_policy: None,
         }
     }
@@ -251,6 +255,64 @@ impl Lineage {
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub(crate) struct GcBlocking {
+    pub(crate) started_at: NaiveDateTime,
+    pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
+}
+
+#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
+#[enumset(serialize_repr = "list")]
+pub(crate) enum GcBlockingReason {
+    Manual,
+    DetachAncestor,
+}
+
+impl GcBlocking {
+    pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
+        GcBlocking {
+            started_at: chrono::Utc::now().naive_utc(),
+            reasons: enumset::EnumSet::only(reason),
+        }
+    }
+
+    /// Returns true if the given reason is one of the reasons why the gc is blocked.
+    pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
+        self.reasons.contains(reason)
+    }
+
+    /// Returns a version of self with the given reason.
+    pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
+        assert!(!self.blocked_by(reason));
+        let mut reasons = self.reasons;
+        reasons.insert(reason);
+
+        Self {
+            started_at: self.started_at,
+            reasons,
+        }
+    }
+
+    /// Returns a version of self without the given reason. Assumption is that if
+    /// there are no more reasons, we can unblock the gc by returning `None`.
+    pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
+        assert!(self.blocked_by(reason));
+
+        if self.reasons.len() == 1 {
+            None
+        } else {
+            let mut reasons = self.reasons;
+            assert!(reasons.remove(reason));
+            assert!(!reasons.is_empty());
+
+            Some(Self {
+                started_at: self.started_at,
+                reasons,
+            })
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -292,6 +354,7 @@ mod tests {
             deleted_at: None,
             archived_at: None,
             lineage: Lineage::default(),
+            gc_blocking: None,
             last_aux_file_policy: None,
         };
 
@@ -335,6 +398,7 @@ mod tests {
             deleted_at: None,
             archived_at: None,
             lineage: Lineage::default(),
+            gc_blocking: None,
             last_aux_file_policy: None,
         };
 
@@ -379,6 +443,7 @@ mod tests {
             deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
             archived_at: None,
             lineage: Lineage::default(),
+            gc_blocking: None,
             last_aux_file_policy: None,
         };
 
@@ -426,6 +491,7 @@ mod tests {
             deleted_at: None,
             archived_at: None,
             lineage: Lineage::default(),
+            gc_blocking: None,
             last_aux_file_policy: None,
         };
 
@@ -468,6 +534,7 @@ mod tests {
             deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
             archived_at: None,
             lineage: Lineage::default(),
+            gc_blocking: None,
             last_aux_file_policy: None,
         };
 
@@ -513,6 +580,7 @@ mod tests {
                 reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                 original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
             },
+            gc_blocking: None,
             last_aux_file_policy: None,
         };
 
@@ -563,6 +631,7 @@ mod tests {
                 reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                 original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
             },
+            gc_blocking: None,
             last_aux_file_policy: Some(AuxFilePolicy::V2),
         };
 
@@ -618,6 +687,7 @@ mod tests {
             deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
             archived_at: None,
             lineage: Default::default(),
+            gc_blocking: None,
             last_aux_file_policy: Default::default(),
         };
 
@@ -674,6 +744,7 @@ mod tests {
             deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
             archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
             lineage: Default::default(),
+            gc_blocking: None,
             last_aux_file_policy: Default::default(),
         };
 
@@ -681,6 +752,68 @@ mod tests {
         assert_eq!(part, expected);
     }
 
+    #[test]
+    fn v9_indexpart_is_parsed() {
+        let example = r#"{
+            "version": 9,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata": {
+                "disk_consistent_lsn": "0/16960E8",
+                "prev_record_lsn": "0/1696070",
+                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/1696070",
+                "initdb_lsn": "0/1696070",
+                "pg_version": 14
+            },
+            "gc_blocking": {
+                "started_at": "2024-07-19T09:00:00.123",
+                "reasons": ["DetachAncestor"]
+            }
+        }"#;
+
+        let expected = IndexPart {
+            version: 9,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::new(
+                Lsn::from_str("0/16960E8").unwrap(),
+                Some(Lsn::from_str("0/1696070").unwrap()),
+                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
+                Lsn::INVALID,
+                Lsn::from_str("0/1696070").unwrap(),
+                Lsn::from_str("0/1696070").unwrap(),
+                14,
+            ).with_recalculated_checksum().unwrap(),
+            deleted_at: None,
+            lineage: Default::default(),
+            gc_blocking: Some(GcBlocking {
+                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
+                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
+            }),
+            last_aux_file_policy: Default::default(),
+            archived_at: None,
+        };
+
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
     fn parse_naive_datetime(s: &str) -> NaiveDateTime {
         chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 05bf4eac8b..79bfd1ebb2 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5698,6 +5698,22 @@ impl Timeline {
         }
     }
 
+    /// Persistently blocks gc for `Manual` reason.
+    ///
+    /// Returns true if no such block existed before, false otherwise.
+    pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result<bool> {
+        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
+        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
+        tenant.gc_block.insert(self, GcBlockingReason::Manual).await
+    }
+
+    /// Persistently unblocks gc for `Manual` reason.
+    pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> {
+        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
+        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
+        tenant.gc_block.remove(self, GcBlockingReason::Manual).await
+    }
+
     #[cfg(test)]
     pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
         self.last_record_lsn.advance(new_lsn);
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 05178c38b4..b03dbb092e 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -230,6 +230,8 @@ impl DeleteTimelineFlow {
         // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
         timeline.shutdown(super::ShutdownMode::Hard).await;
 
+        tenant.gc_block.before_delete(&timeline);
+
         fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
             Err(anyhow::anyhow!(
                 "failpoint: timeline-delete-before-index-deleted-at"
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 192324f086..61e2204b23 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -556,6 +556,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
+    def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc",
+        )
+        log.info(f"Got GC request response code: {res.status_code}")
+        self.verbose_error(res)
+
+    def timeline_unblock_gc(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc",
+        )
+        log.info(f"Got GC request response code: {res.status_code}")
+        self.verbose_error(res)
+
     def timeline_compact(
         self,
         tenant_id: Union[TenantId, TenantShardId],
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
new file mode 100644
index 0000000000..24de894687
--- /dev/null
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -0,0 +1,67 @@
+import time
+
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+from fixtures.pageserver.utils import wait_timeline_detail_404
+
+
+def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}
+    )
+    ps = env.pageserver
+    http = ps.http_client()
+
+    foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant)
+
+    gc_active_line = ".* gc_loop.*: [12] timelines need GC"
+    gc_skipped_line = ".* gc_loop.*: Skipping GC: .*"
+    init_gc_skipped = ".*: initialized with gc blocked.*"
+
+    tenant_before = http.tenant_status(env.initial_tenant)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_active_line)
+
+    assert ps.log_contains(gc_skipped_line, offset) is None
+
+    http.timeline_block_gc(env.initial_tenant, foo_branch)
+
+    tenant_after = http.tenant_status(env.initial_tenant)
+    assert tenant_before != tenant_after
+    gc_blocking = tenant_after["gc_blocking"]
+    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+
+    ps.restart()
+    ps.quiesce_tenants()
+
+    _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+
+    # deletion unblocks gc
+    http.timeline_delete(env.initial_tenant, foo_branch)
+    wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_active_line, offset)
+
+    http.timeline_block_gc(env.initial_tenant, env.initial_timeline)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+
+    # removing the manual block also unblocks gc
+    http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_active_line, offset)
+
+
+def wait_for_another_gc_round():
+    time.sleep(2)

From 608c3cedbf53a3d076a8003005dab22cb913c529 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 6 Aug 2024 10:14:01 +0100
Subject: [PATCH 11/48] pageserver: remove legacy read path (#8601)

## Problem

We have been maintaining two read paths (legacy and vectored) for a
while now. The legacy read-path was only used for cross validation in some tests.

## Summary of changes
* Tweak all tests that were using the legacy read path to use the
vectored read path instead
* Remove the read path dispatching based on the pageserver configs
* Remove the legacy read path code

We will be able to remove the single blob io code in
`pageserver/src/tenant/blob_io.rs` when https://github.com/neondatabase/neon/issues/7386 is complete.

Closes https://github.com/neondatabase/neon/issues/8005
---
 pageserver/src/tenant.rs                      |  53 +-
 pageserver/src/tenant/storage_layer.rs        |  15 -
 .../src/tenant/storage_layer/delta_layer.rs   |  91 +--
 .../src/tenant/storage_layer/image_layer.rs   |  44 +-
 .../tenant/storage_layer/inmemory_layer.rs    |  76 +--
 pageserver/src/tenant/storage_layer/layer.rs  |  73 +--
 .../src/tenant/storage_layer/layer/tests.rs   |  38 +-
 pageserver/src/tenant/timeline.rs             | 581 ++----------------
 8 files changed, 121 insertions(+), 850 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index de9b55d847..989ed0d4eb 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4122,7 +4122,7 @@ pub(crate) mod harness {
 
 #[cfg(test)]
 mod tests {
-    use std::collections::BTreeMap;
+    use std::collections::{BTreeMap, BTreeSet};
 
     use super::*;
     use crate::keyspace::KeySpaceAccum;
@@ -4797,7 +4797,7 @@ mod tests {
         lsn: Lsn,
         repeat: usize,
         key_count: usize,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
         let compact = true;
         bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
     }
@@ -4810,7 +4810,9 @@ mod tests {
         repeat: usize,
         key_count: usize,
         compact: bool,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
+        let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
+
         let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
         let mut blknum = 0;
 
@@ -4831,6 +4833,7 @@ mod tests {
                         ctx,
                     )
                     .await?;
+                inserted.entry(test_key).or_default().insert(lsn);
                 writer.finish_write(lsn);
                 drop(writer);
 
@@ -4855,7 +4858,7 @@ mod tests {
             assert_eq!(res.layers_removed, 0, "this never removes anything");
         }
 
-        Ok(())
+        Ok(inserted)
     }
 
     //
@@ -4902,7 +4905,7 @@ mod tests {
             .await?;
 
         let lsn = Lsn(0x10);
-        bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
+        let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
 
         let guard = tline.layers.read().await;
         guard.layer_map().dump(true, &ctx).await?;
@@ -4963,9 +4966,39 @@ mod tests {
                     &ctx,
                 )
                 .await;
-            tline
-                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
-                .await;
+
+            let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
+            let mut expect_missing = false;
+            let mut key = read.start().unwrap();
+            while key != read.end().unwrap() {
+                if let Some(lsns) = inserted.get(&key) {
+                    let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
+                    match expected_lsn {
+                        Some(lsn) => {
+                            expected_lsns.insert(key, *lsn);
+                        }
+                        None => {
+                            expect_missing = true;
+                            break;
+                        }
+                    }
+                } else {
+                    expect_missing = true;
+                    break;
+                }
+
+                key = key.next();
+            }
+
+            if expect_missing {
+                assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
+            } else {
+                for (key, image) in vectored_res? {
+                    let expected_lsn = expected_lsns.get(&key).expect("determined above");
+                    let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
+                    assert_eq!(image?, expected_image);
+                }
+            }
         }
 
         Ok(())
@@ -5015,10 +5048,6 @@ mod tests {
             )
             .await;
 
-        child_timeline
-            .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
-            .await;
-
         let images = vectored_res?;
         assert!(images.is_empty());
         Ok(())
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 59d3e1ce09..ab32a6035e 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -435,21 +435,6 @@ impl ReadableLayer {
     }
 }
 
-/// Return value from [`Layer::get_value_reconstruct_data`]
-#[derive(Clone, Copy, Debug)]
-pub enum ValueReconstructResult {
-    /// Got all the data needed to reconstruct the requested page
-    Complete,
-    /// This layer didn't contain all the required data, the caller should look up
-    /// the predecessor layer at the returned LSN and collect more data from there.
-    Continue,
-
-    /// This layer didn't contain data needed to reconstruct the page version at
-    /// the returned LSN. This is usually considered an error, but might be OK
-    /// in some circumstances.
-    Missing,
-}
-
 /// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
 /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
 /// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index e50fc2a266..a17dd28547 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{
     DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
-use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::storage_layer::Layer;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -826,95 +826,6 @@ impl DeltaLayerInner {
         })
     }
 
-    pub(super) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        let mut need_image = true;
-        // Scan the page versions backwards, starting from `lsn`.
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            &block_reader,
-        );
-        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
-
-        let mut offsets: Vec<(Lsn, u64)> = Vec::new();
-
-        tree_reader
-            .visit(
-                &search_key.0,
-                VisitDirection::Backwards,
-                |key, value| {
-                    let blob_ref = BlobRef(value);
-                    if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
-                        return false;
-                    }
-                    let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
-                    if entry_lsn < lsn_range.start {
-                        return false;
-                    }
-                    offsets.push((entry_lsn, blob_ref.pos()));
-
-                    !blob_ref.will_init()
-                },
-                &RequestContextBuilder::extend(ctx)
-                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
-                    .build(),
-            )
-            .await?;
-
-        let ctx = &RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::DeltaLayerValue)
-            .build();
-
-        // Ok, 'offsets' now contains the offsets of all the entries we need to read
-        let cursor = block_reader.block_cursor();
-        let mut buf = Vec::new();
-        for (entry_lsn, pos) in offsets {
-            cursor
-                .read_blob_into_buf(pos, &mut buf, ctx)
-                .await
-                .with_context(|| {
-                    format!("Failed to read blob from virtual file {}", self.file.path)
-                })?;
-            let val = Value::des(&buf).with_context(|| {
-                format!(
-                    "Failed to deserialize file blob from virtual file {}",
-                    self.file.path
-                )
-            })?;
-            match val {
-                Value::Image(img) => {
-                    reconstruct_state.img = Some((entry_lsn, img));
-                    need_image = false;
-                    break;
-                }
-                Value::WalRecord(rec) => {
-                    let will_init = rec.will_init();
-                    reconstruct_state.records.push((entry_lsn, rec));
-                    if will_init {
-                        // This WAL record initializes the page, so no need to go further back
-                        need_image = false;
-                        break;
-                    }
-                }
-            }
-        }
-
-        // If an older page image is needed to reconstruct the page, let the
-        // caller know.
-        if need_image {
-            Ok(ValueReconstructResult::Continue)
-        } else {
-            Ok(ValueReconstructResult::Complete)
-        }
-    }
-
     // Look up the keys in the provided keyspace and update
     // the reconstruct state with whatever is found.
     //
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index f4f48aaf16..b2173455ab 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -32,9 +32,7 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{
     DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
-use crate::tenant::storage_layer::{
-    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
-};
+use crate::tenant::storage_layer::LayerAccessStats;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -429,46 +427,6 @@ impl ImageLayerInner {
         })
     }
 
-    pub(super) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let tree_reader =
-            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
-
-        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
-        key.write_to_byte_slice(&mut keybuf);
-        if let Some(offset) = tree_reader
-            .get(
-                &keybuf,
-                &RequestContextBuilder::extend(ctx)
-                    .page_content_kind(PageContentKind::ImageLayerBtreeNode)
-                    .build(),
-            )
-            .await?
-        {
-            let blob = block_reader
-                .block_cursor()
-                .read_blob(
-                    offset,
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::ImageLayerValue)
-                        .build(),
-                )
-                .await
-                .with_context(|| format!("failed to read value from offset {}", offset))?;
-            let value = Bytes::from(blob);
-
-            reconstruct_state.img = Some((self.lsn, value));
-            Ok(ValueReconstructResult::Complete)
-        } else {
-            Ok(ValueReconstructResult::Missing)
-        }
-    }
-
     // Look up the keys in the provided keyspace and update
     // the reconstruct state with whatever is found.
     pub(super) async fn get_values_reconstruct_data(
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index f9010ae8a6..6abc89c2ed 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,11 +10,10 @@ use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
-use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::{l0_flush, page_cache, walrecord};
-use anyhow::{anyhow, ensure, Result};
+use anyhow::{anyhow, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -33,10 +32,7 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::{RwLock, RwLockWriteGuard};
 
-use super::{
-    DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
-    ValuesReconstructState,
-};
+use super::{DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValuesReconstructState};
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
 pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
@@ -55,9 +51,6 @@ pub struct InMemoryLayer {
     /// Writes are only allowed when this is `None`.
     pub(crate) end_lsn: OnceLock<Lsn>,
 
-    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
-    local_path_str: Arc<str>,
-
     /// Used for traversal path. Cached representation of the in-memory layer after frozen.
     frozen_local_path_str: OnceLock<Arc<str>>,
 
@@ -248,12 +241,6 @@ impl InMemoryLayer {
         self.start_lsn..self.end_lsn_or_max()
     }
 
-    pub(crate) fn local_path_str(&self) -> &Arc<str> {
-        self.frozen_local_path_str
-            .get()
-            .unwrap_or(&self.local_path_str)
-    }
-
     /// debugging function to print out the contents of the layer
     ///
     /// this is likely completly unused
@@ -303,60 +290,6 @@ impl InMemoryLayer {
         Ok(())
     }
 
-    /// Look up given value in the layer.
-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.start_lsn);
-        let mut need_image = true;
-
-        let ctx = RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::InMemoryLayer)
-            .build();
-
-        let inner = self.inner.read().await;
-
-        let reader = inner.file.block_cursor();
-
-        // Scan the page versions backwards, starting from `lsn`.
-        if let Some(vec_map) = inner.index.get(&key) {
-            let slice = vec_map.slice_range(lsn_range);
-            for (entry_lsn, pos) in slice.iter().rev() {
-                let buf = reader.read_blob(*pos, &ctx).await?;
-                let value = Value::des(&buf)?;
-                match value {
-                    Value::Image(img) => {
-                        reconstruct_state.img = Some((*entry_lsn, img));
-                        return Ok(ValueReconstructResult::Complete);
-                    }
-                    Value::WalRecord(rec) => {
-                        let will_init = rec.will_init();
-                        reconstruct_state.records.push((*entry_lsn, rec));
-                        if will_init {
-                            // This WAL record initializes the page, so no need to go further back
-                            need_image = false;
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-
-        // release lock on 'inner'
-
-        // If an older page image is needed to reconstruct the page, let the
-        // caller know.
-        if need_image {
-            Ok(ValueReconstructResult::Continue)
-        } else {
-            Ok(ValueReconstructResult::Complete)
-        }
-    }
-
     // Look up the keys in the provided keyspace and update
     // the reconstruct state with whatever is found.
     //
@@ -458,11 +391,6 @@ impl InMemoryLayer {
 
         Ok(InMemoryLayer {
             file_id: key,
-            local_path_str: {
-                let mut buf = String::new();
-                inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
-                buf.into()
-            },
             frozen_local_path_str: OnceLock::new(),
             conf,
             timeline_id,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 5732779e44..cee2fe7342 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
 use super::image_layer::{self};
 use super::{
     AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
-    ValuesReconstructState,
+    LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
 };
 
 use utils::generation::Generation;
@@ -301,42 +300,6 @@ impl Layer {
         self.0.delete_on_drop();
     }
 
-    /// Return data needed to reconstruct given page at LSN.
-    ///
-    /// It is up to the caller to collect more data from the previous layer and
-    /// perform WAL redo, if necessary.
-    ///
-    /// # Cancellation-Safety
-    ///
-    /// This method is cancellation-safe.
-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        use anyhow::ensure;
-
-        let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
-        self.0.access_stats.record_access(ctx);
-
-        if self.layer_desc().is_delta {
-            ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
-            ensure!(self.layer_desc().key_range.contains(&key));
-        } else {
-            ensure!(self.layer_desc().key_range.contains(&key));
-            ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
-            ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
-        }
-
-        layer
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
-            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
-            .await
-            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
-    }
-
     pub(crate) async fn get_values_reconstruct_data(
         &self,
         keyspace: KeySpace,
@@ -441,10 +404,6 @@ impl Layer {
         &self.0.path
     }
 
-    pub(crate) fn debug_str(&self) -> &Arc<str> {
-        &self.0.debug_str
-    }
-
     pub(crate) fn metadata(&self) -> LayerFileMetadata {
         self.0.metadata()
     }
@@ -519,7 +478,7 @@ impl Layer {
 ///
 /// However when we want something evicted, we cannot evict it right away as there might be current
 /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
-/// read with [`Layer::get_value_reconstruct_data`].
+/// read with [`Layer::get_values_reconstruct_data`].
 ///
 /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
 #[derive(Debug)]
@@ -600,9 +559,6 @@ struct LayerInner {
     /// Full path to the file; unclear if this should exist anymore.
     path: Utf8PathBuf,
 
-    /// String representation of the layer, used for traversal id.
-    debug_str: Arc<str>,
-
     desc: PersistentLayerDesc,
 
     /// Timeline access is needed for remote timeline client and metrics.
@@ -836,9 +792,6 @@ impl LayerInner {
 
         LayerInner {
             conf,
-            debug_str: {
-                format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
-            },
             path: local_path,
             desc,
             timeline: Arc::downgrade(timeline),
@@ -1759,28 +1712,6 @@ impl DownloadedLayer {
             .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
     }
 
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        owner: &Arc<LayerInner>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        use LayerKind::*;
-
-        match self.get(owner, ctx).await? {
-            Delta(d) => {
-                d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
-                    .await
-            }
-            Image(i) => {
-                i.get_value_reconstruct_data(key, reconstruct_data, ctx)
-                    .await
-            }
-        }
-    }
-
     async fn get_values_reconstruct_data(
         &self,
         keyspace: KeySpace,
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 423cde001c..6b0d5f09ff 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -50,13 +50,26 @@ async fn smoke_test() {
     // all layers created at pageserver are like `layer`, initialized with strong
     // Arc<DownloadedLayer>.
 
+    let controlfile_keyspace = KeySpace {
+        ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
+    };
+
     let img_before = {
-        let mut data = ValueReconstructState::default();
+        let mut data = ValuesReconstructState::default();
         layer
-            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
+            .get_values_reconstruct_data(
+                controlfile_keyspace.clone(),
+                Lsn(0x10)..Lsn(0x11),
+                &mut data,
+                &ctx,
+            )
             .await
             .unwrap();
-        data.img
+        data.keys
+            .remove(&CONTROLFILE_KEY)
+            .expect("must be present")
+            .expect("should not error")
+            .img
             .take()
             .expect("tenant harness writes the control file")
     };
@@ -74,13 +87,24 @@ async fn smoke_test() {
 
     // on accesses when the layer is evicted, it will automatically be downloaded.
     let img_after = {
-        let mut data = ValueReconstructState::default();
+        let mut data = ValuesReconstructState::default();
         layer
-            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
+            .get_values_reconstruct_data(
+                controlfile_keyspace.clone(),
+                Lsn(0x10)..Lsn(0x11),
+                &mut data,
+                &ctx,
+            )
             .instrument(download_span.clone())
             .await
             .unwrap();
-        data.img.take().unwrap()
+        data.keys
+            .remove(&CONTROLFILE_KEY)
+            .expect("must be present")
+            .expect("should not error")
+            .img
+            .take()
+            .expect("tenant harness writes the control file")
     };
 
     assert_eq!(img_before, img_after);
@@ -830,7 +854,7 @@ async fn eviction_cancellation_on_drop() {
 fn layer_size() {
     assert_eq!(size_of::<LayerAccessStats>(), 8);
     assert_eq!(size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(size_of::<LayerInner>(), 312);
+    assert_eq!(size_of::<LayerInner>(), 296);
     // it also has the utf8 path
 }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 79bfd1ebb2..5a02fd4a4c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
     key::{
-        AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
-        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
+        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
+        NON_INHERITED_SPARSE_RANGE,
     },
     keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
@@ -59,10 +59,7 @@ use std::{
     collections::{BTreeMap, HashMap, HashSet},
     sync::atomic::AtomicU64,
 };
-use std::{
-    cmp::{max, min},
-    ops::ControlFlow,
-};
+use std::{cmp::min, ops::ControlFlow};
 use std::{
     collections::btree_map::Entry,
     ops::{Deref, Range},
@@ -87,8 +84,8 @@ use crate::{
     disk_usage_eviction_task::finite_f32,
     tenant::storage_layer::{
         AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
-        LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
-        ValueReconstructState, ValuesReconstructState,
+        LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState,
+        ValuesReconstructState,
     },
 };
 use crate::{
@@ -543,7 +540,6 @@ pub struct MissingKeyError {
     cont_lsn: Lsn,
     request_lsn: Lsn,
     ancestor_lsn: Option<Lsn>,
-    traversal_path: Vec<TraversalPathItem>,
     backtrace: Option<std::backtrace::Backtrace>,
 }
 
@@ -564,18 +560,6 @@ impl std::fmt::Display for MissingKeyError {
             write!(f, ", ancestor {}", ancestor_lsn)?;
         }
 
-        if !self.traversal_path.is_empty() {
-            writeln!(f)?;
-        }
-
-        for (r, c, l) in &self.traversal_path {
-            writeln!(
-                f,
-                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
-                r, c, l,
-            )?;
-        }
-
         if let Some(ref backtrace) = self.backtrace {
             write!(f, "\n{}", backtrace)?;
         }
@@ -918,119 +902,44 @@ impl Timeline {
 
         self.timeline_get_throttle.throttle(ctx, 1).await;
 
-        match self.conf.get_impl {
-            GetImpl::Legacy => {
-                let reconstruct_state = ValueReconstructState {
-                    records: Vec::new(),
-                    img: None,
-                };
+        let keyspace = KeySpace {
+            ranges: vec![key..key.next()],
+        };
 
-                self.get_impl(key, lsn, reconstruct_state, ctx).await
-            }
-            GetImpl::Vectored => {
-                let keyspace = KeySpace {
-                    ranges: vec![key..key.next()],
-                };
+        // Initialise the reconstruct state for the key with the cache
+        // entry returned above.
+        let mut reconstruct_state = ValuesReconstructState::new();
 
-                // Initialise the reconstruct state for the key with the cache
-                // entry returned above.
-                let mut reconstruct_state = ValuesReconstructState::new();
+        let vectored_res = self
+            .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+            .await;
 
-                let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
-                    .await;
-
-                if self.conf.validate_vectored_get {
-                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
-                        .await;
-                }
-
-                let key_value = vectored_res?.pop_first();
-                match key_value {
-                    Some((got_key, value)) => {
-                        if got_key != key {
-                            error!(
-                                "Expected {}, but singular vectored get returned {}",
-                                key, got_key
-                            );
-                            Err(PageReconstructError::Other(anyhow!(
-                                "Singular vectored get returned wrong key"
-                            )))
-                        } else {
-                            value
-                        }
-                    }
-                    None => Err(PageReconstructError::MissingKey(MissingKeyError {
-                        key,
-                        shard: self.shard_identity.get_shard_number(&key),
-                        cont_lsn: Lsn(0),
-                        request_lsn: lsn,
-                        ancestor_lsn: None,
-                        traversal_path: Vec::new(),
-                        backtrace: None,
-                    })),
+        let key_value = vectored_res?.pop_first();
+        match key_value {
+            Some((got_key, value)) => {
+                if got_key != key {
+                    error!(
+                        "Expected {}, but singular vectored get returned {}",
+                        key, got_key
+                    );
+                    Err(PageReconstructError::Other(anyhow!(
+                        "Singular vectored get returned wrong key"
+                    )))
+                } else {
+                    value
                 }
             }
+            None => Err(PageReconstructError::MissingKey(MissingKeyError {
+                key,
+                shard: self.shard_identity.get_shard_number(&key),
+                cont_lsn: Lsn(0),
+                request_lsn: lsn,
+                ancestor_lsn: None,
+                backtrace: None,
+            })),
         }
     }
 
-    /// Not subject to [`Self::timeline_get_throttle`].
-    async fn get_impl(
-        &self,
-        key: Key,
-        lsn: Lsn,
-        mut reconstruct_state: ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        // XXX: structured stats collection for layer eviction here.
-        trace!(
-            "get page request for {}@{} from task kind {:?}",
-            key,
-            lsn,
-            ctx.task_kind()
-        );
-
-        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
-            .for_get_kind(GetKind::Singular)
-            .start_timer();
-        let path = self
-            .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
-            .await?;
-        timer.stop_and_record();
-
-        let start = Instant::now();
-        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
-        let elapsed = start.elapsed();
-        crate::metrics::RECONSTRUCT_TIME
-            .for_get_kind(GetKind::Singular)
-            .observe(elapsed.as_secs_f64());
-
-        if cfg!(feature = "testing")
-            && res.is_err()
-            && !matches!(res, Err(PageReconstructError::Cancelled))
-        {
-            // it can only be walredo issue
-            use std::fmt::Write;
-
-            let mut msg = String::new();
-
-            path.into_iter().for_each(|(res, cont_lsn, layer)| {
-                writeln!(
-                    msg,
-                    "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
-                    layer,
-                )
-                .expect("string grows")
-            });
-
-            // this is to rule out or provide evidence that we could in some cases read a duplicate
-            // walrecord
-            tracing::info!("walredo failed, path:\n{msg}");
-        }
-
-        res
-    }
-
     pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
     pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;
 
@@ -1080,28 +989,14 @@ impl Timeline {
             .throttle(ctx, key_count as usize)
             .await;
 
-        let res = match self.conf.get_vectored_impl {
-            GetVectoredImpl::Sequential => {
-                self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
-            }
-            GetVectoredImpl::Vectored => {
-                let vectored_res = self
-                    .get_vectored_impl(
-                        keyspace.clone(),
-                        lsn,
-                        &mut ValuesReconstructState::new(),
-                        ctx,
-                    )
-                    .await;
-
-                if self.conf.validate_vectored_get {
-                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
-                        .await;
-                }
-
-                vectored_res
-            }
-        };
+        let res = self
+            .get_vectored_impl(
+                keyspace.clone(),
+                lsn,
+                &mut ValuesReconstructState::new(),
+                ctx,
+            )
+            .await;
 
         if let Some((metric, start)) = start {
             let elapsed = start.elapsed();
@@ -1190,65 +1085,6 @@ impl Timeline {
         vectored_res
     }
 
-    /// Not subject to [`Self::timeline_get_throttle`].
-    pub(super) async fn get_vectored_sequential_impl(
-        &self,
-        keyspace: KeySpace,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
-        let mut values = BTreeMap::new();
-
-        for range in keyspace.ranges {
-            let mut key = range.start;
-            while key != range.end {
-                let block = self
-                    .get_impl(key, lsn, ValueReconstructState::default(), ctx)
-                    .await;
-
-                use PageReconstructError::*;
-                match block {
-                    Err(Cancelled) => return Err(GetVectoredError::Cancelled),
-                    Err(MissingKey(_))
-                        if NON_INHERITED_RANGE.contains(&key)
-                            || NON_INHERITED_SPARSE_RANGE.contains(&key) =>
-                    {
-                        // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range.
-                        // When we add more types of keys into the page server, we should revisit this part of code and throw errors
-                        // accordingly.
-                        key = key.next();
-                    }
-                    Err(MissingKey(err)) => {
-                        return Err(GetVectoredError::MissingKey(err));
-                    }
-                    Err(Other(err))
-                        if err
-                            .to_string()
-                            .contains("downloading evicted layer file failed") =>
-                    {
-                        return Err(GetVectoredError::Other(err))
-                    }
-                    Err(Other(err))
-                        if err
-                            .chain()
-                            .any(|cause| cause.to_string().contains("layer loading failed")) =>
-                    {
-                        // The intent here is to achieve error parity with the vectored read path.
-                        // When vectored read fails to load a layer it fails the whole read, hence
-                        // we mimic this behaviour here to keep the validation happy.
-                        return Err(GetVectoredError::Other(err));
-                    }
-                    _ => {
-                        values.insert(key, block);
-                        key = key.next();
-                    }
-                }
-            }
-        }
-
-        Ok(values)
-    }
-
     pub(super) async fn get_vectored_impl(
         &self,
         keyspace: KeySpace,
@@ -1319,113 +1155,6 @@ impl Timeline {
         Ok(results)
     }
 
-    /// Not subject to [`Self::timeline_get_throttle`].
-    pub(super) async fn validate_get_vectored_impl(
-        &self,
-        vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
-        keyspace: KeySpace,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) {
-        if keyspace.overlaps(&Key::metadata_key_range()) {
-            // skip validation for metadata key range
-            return;
-        }
-
-        let sequential_res = self
-            .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
-            .await;
-
-        fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
-            use GetVectoredError::*;
-            match (lhs, rhs) {
-                (Oversized(l), Oversized(r)) => l == r,
-                (InvalidLsn(l), InvalidLsn(r)) => l == r,
-                (MissingKey(l), MissingKey(r)) => l.key == r.key,
-                (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
-                (Other(_), Other(_)) => true,
-                _ => false,
-            }
-        }
-
-        match (&sequential_res, vectored_res) {
-            (Err(GetVectoredError::Cancelled), _) => {},
-            (_, Err(GetVectoredError::Cancelled)) => {},
-            (Err(seq_err), Ok(_)) => {
-                panic!(concat!("Sequential get failed with {}, but vectored get did not",
-                               " - keyspace={:?} lsn={}"),
-                       seq_err, keyspace, lsn) },
-            (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
-                // Sequential get runs after vectored get, so it is possible for the later
-                // to time out while waiting for its ancestor's Lsn to become ready and for the
-                // former to succeed (it essentially has a doubled wait time).
-            },
-            (Ok(_), Err(vec_err)) => {
-                panic!(concat!("Vectored get failed with {}, but sequential get did not",
-                               " - keyspace={:?} lsn={}"),
-                       vec_err, keyspace, lsn) },
-            (Err(seq_err), Err(vec_err)) => {
-                assert!(errors_match(seq_err, vec_err),
-                        "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")},
-            (Ok(seq_values), Ok(vec_values)) => {
-                seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| {
-                    assert_eq!(seq_key, vec_key);
-                    match (seq_res, vec_res) {
-                        (Ok(seq_blob), Ok(vec_blob)) => {
-                            Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob);
-                        },
-                        (Err(err), Ok(_)) => {
-                            panic!(
-                                concat!("Sequential get failed with {} for key {}, but vectored get did not",
-                                        " - keyspace={:?} lsn={}"),
-                                err, seq_key, keyspace, lsn) },
-                        (Ok(_), Err(err)) => {
-                            panic!(
-                                concat!("Vectored get failed with {} for key {}, but sequential get did not",
-                                        " - keyspace={:?} lsn={}"),
-                                err, seq_key, keyspace, lsn) },
-                        (Err(_), Err(_)) => {}
-                    }
-                })
-            }
-        }
-    }
-
-    fn validate_key_equivalence(
-        key: &Key,
-        keyspace: &KeySpace,
-        lsn: Lsn,
-        seq: &Bytes,
-        vec: &Bytes,
-    ) {
-        if *key == AUX_FILES_KEY {
-            // The value reconstruct of AUX_FILES_KEY from records is not deterministic
-            // since it uses a hash map under the hood. Hence, deserialise both results
-            // before comparing.
-            let seq_aux_dir_res = AuxFilesDirectory::des(seq);
-            let vec_aux_dir_res = AuxFilesDirectory::des(vec);
-            match (&seq_aux_dir_res, &vec_aux_dir_res) {
-                (Ok(seq_aux_dir), Ok(vec_aux_dir)) => {
-                    assert_eq!(
-                        seq_aux_dir, vec_aux_dir,
-                        "Mismatch for key {} - keyspace={:?} lsn={}",
-                        key, keyspace, lsn
-                    );
-                }
-                (Err(_), Err(_)) => {}
-                _ => {
-                    panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}");
-                }
-            }
-        } else {
-            // All other keys should reconstruct deterministically, so we simply compare the blobs.
-            assert_eq!(
-                seq, vec,
-                "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}"
-            );
-        }
-    }
-
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub(crate) fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last
@@ -3215,228 +2944,7 @@ impl Timeline {
     }
 }
 
-type TraversalId = Arc<str>;
-
-trait TraversalLayerExt {
-    fn traversal_id(&self) -> TraversalId;
-}
-
-impl TraversalLayerExt for Layer {
-    fn traversal_id(&self) -> TraversalId {
-        Arc::clone(self.debug_str())
-    }
-}
-
-impl TraversalLayerExt for Arc<InMemoryLayer> {
-    fn traversal_id(&self) -> TraversalId {
-        Arc::clone(self.local_path_str())
-    }
-}
-
 impl Timeline {
-    ///
-    /// Get a handle to a Layer for reading.
-    ///
-    /// The returned Layer might be from an ancestor timeline, if the
-    /// segment hasn't been updated on this timeline yet.
-    ///
-    /// This function takes the current timeline's locked LayerMap as an argument,
-    /// so callers can avoid potential race conditions.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
-    async fn get_reconstruct_data(
-        &self,
-        key: Key,
-        request_lsn: Lsn,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
-        // Start from the current timeline.
-        let mut timeline_owned;
-        let mut timeline = self;
-
-        let mut read_count = scopeguard::guard(0, |cnt| {
-            crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
-        });
-
-        // For debugging purposes, collect the path of layers that we traversed
-        // through. It's included in the error message if we fail to find the key.
-        let mut traversal_path = Vec::<TraversalPathItem>::new();
-
-        let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
-            *cached_lsn
-        } else {
-            Lsn(0)
-        };
-
-        // 'prev_lsn' tracks the last LSN that we were at in our search. It's used
-        // to check that each iteration make some progress, to break infinite
-        // looping if something goes wrong.
-        let mut prev_lsn = None;
-
-        let mut result = ValueReconstructResult::Continue;
-        let mut cont_lsn = Lsn(request_lsn.0 + 1);
-
-        'outer: loop {
-            if self.cancel.is_cancelled() {
-                return Err(PageReconstructError::Cancelled);
-            }
-
-            // The function should have updated 'state'
-            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
-            match result {
-                ValueReconstructResult::Complete => return Ok(traversal_path),
-                ValueReconstructResult::Continue => {
-                    // If we reached an earlier cached page image, we're done.
-                    if cont_lsn == cached_lsn + 1 {
-                        return Ok(traversal_path);
-                    }
-                    if let Some(prev) = prev_lsn {
-                        if prev <= cont_lsn {
-                            // Didn't make any progress in last iteration. Error out to avoid
-                            // getting stuck in the loop.
-                            return Err(PageReconstructError::MissingKey(MissingKeyError {
-                                key,
-                                shard: self.shard_identity.get_shard_number(&key),
-                                cont_lsn: Lsn(cont_lsn.0 - 1),
-                                request_lsn,
-                                ancestor_lsn: Some(timeline.ancestor_lsn),
-                                traversal_path,
-                                backtrace: None,
-                            }));
-                        }
-                    }
-                    prev_lsn = Some(cont_lsn);
-                }
-                ValueReconstructResult::Missing => {
-                    return Err(PageReconstructError::MissingKey(MissingKeyError {
-                        key,
-                        shard: self.shard_identity.get_shard_number(&key),
-                        cont_lsn,
-                        request_lsn,
-                        ancestor_lsn: None,
-                        traversal_path,
-                        backtrace: if cfg!(test) {
-                            Some(std::backtrace::Backtrace::force_capture())
-                        } else {
-                            None
-                        },
-                    }));
-                }
-            }
-
-            // Recurse into ancestor if needed
-            if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
-                if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
-                    trace!(
-                        "going into ancestor {}, cont_lsn is {}",
-                        timeline.ancestor_lsn,
-                        cont_lsn
-                    );
-
-                    timeline_owned = timeline
-                        .get_ready_ancestor_timeline(ancestor_timeline, ctx)
-                        .await?;
-                    timeline = &*timeline_owned;
-                    prev_lsn = None;
-                    continue 'outer;
-                }
-            }
-
-            let guard = timeline.layers.read().await;
-            let layers = guard.layer_map();
-
-            // Check the open and frozen in-memory layers first, in order from newest
-            // to oldest.
-            if let Some(open_layer) = &layers.open_layer {
-                let start_lsn = open_layer.get_lsn_range().start;
-                if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
-                    // Get all the data needed to reconstruct the page version from this layer.
-                    // But if we have an older cached page image, no need to go past that.
-                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-
-                    let open_layer = open_layer.clone();
-                    drop(guard);
-
-                    result = match open_layer
-                        .get_value_reconstruct_data(
-                            key,
-                            lsn_floor..cont_lsn,
-                            reconstruct_state,
-                            ctx,
-                        )
-                        .await
-                    {
-                        Ok(result) => result,
-                        Err(e) => return Err(PageReconstructError::from(e)),
-                    };
-                    cont_lsn = lsn_floor;
-                    *read_count += 1;
-                    traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
-                    continue 'outer;
-                }
-            }
-            for frozen_layer in layers.frozen_layers.iter().rev() {
-                let start_lsn = frozen_layer.get_lsn_range().start;
-                if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
-                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-
-                    let frozen_layer = frozen_layer.clone();
-                    drop(guard);
-
-                    result = match frozen_layer
-                        .get_value_reconstruct_data(
-                            key,
-                            lsn_floor..cont_lsn,
-                            reconstruct_state,
-                            ctx,
-                        )
-                        .await
-                    {
-                        Ok(result) => result,
-                        Err(e) => return Err(PageReconstructError::from(e)),
-                    };
-                    cont_lsn = lsn_floor;
-                    *read_count += 1;
-                    traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
-                    continue 'outer;
-                }
-            }
-
-            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
-                let layer = guard.get_from_desc(&layer);
-                drop(guard);
-                // Get all the data needed to reconstruct the page version from this layer.
-                // But if we have an older cached page image, no need to go past that.
-                let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                result = match layer
-                    .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx)
-                    .await
-                {
-                    Ok(result) => result,
-                    Err(e) => return Err(PageReconstructError::from(e)),
-                };
-                cont_lsn = lsn_floor;
-                *read_count += 1;
-                traversal_path.push((result, cont_lsn, layer.traversal_id()));
-                continue 'outer;
-            } else if timeline.ancestor_timeline.is_some() {
-                // Nothing on this timeline. Traverse to parent
-                result = ValueReconstructResult::Continue;
-                cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
-                continue 'outer;
-            } else {
-                // Nothing found
-                result = ValueReconstructResult::Missing;
-                continue 'outer;
-            }
-        }
-    }
-
     #[allow(clippy::doc_lazy_continuation)]
     /// Get the data needed to reconstruct all keys in the provided keyspace
     ///
@@ -3530,7 +3038,6 @@ impl Timeline {
                 cont_lsn,
                 request_lsn,
                 ancestor_lsn: Some(timeline.ancestor_lsn),
-                traversal_path: vec![],
                 backtrace: None,
             }));
         }
@@ -5895,8 +5402,6 @@ impl Timeline {
     }
 }
 
-type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
-
 /// Tracking writes ingestion does to a particular in-memory layer.
 ///
 /// Cleared upon freezing a layer.

From 814b09025030ab814f655971bebf767ec80f4399 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 6 Aug 2024 13:45:41 +0300
Subject: [PATCH 12/48] chore: bump index part version (#8611)

#8600 missed the hunk changing index_part.json informative version.
Include it in this PR, in addition add more non-warning index_part.json
versions to scrubber.
---
 pageserver/src/tenant/remote_timeline_client/index.rs | 5 +++--
 storage_scrubber/src/checks.rs                        | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 8e6290030d..90453b1922 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -88,10 +88,11 @@ impl IndexPart {
     /// - 6: last_aux_file_policy is added.
     /// - 7: metadata_bytes is no longer written, but still read
     /// - 8: added `archived_at`
-    const LATEST_VERSION: usize = 8;
+    /// - 9: +gc_blocking
+    const LATEST_VERSION: usize = 9;
 
     // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 5aa9e88c40..14788515dd 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -92,7 +92,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                             .push(format!("index_part.json version: {}", index_part.version()))
                     }
 
-                    let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2);
+                    let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3);
                     if !newest_versions.any(|ip| ip == &index_part.version()) {
                         info!(
                             "index_part.json version is not latest: {}",

From 44d99757995806bf94e9a2a10e88ff51026512ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 6 Aug 2024 12:51:39 +0200
Subject: [PATCH 13/48] storage_scrubber: migrate scan_safekeeper_metadata to
 remote_storage (#8595)

Migrates the safekeeper-specific parts of `ScanMetadata` to
GenericRemoteStorage, making it Azure-ready.

Part of https://github.com/neondatabase/neon/issues/7547
---
 storage_scrubber/src/metadata_stream.rs       | 32 ++++++++++++++++++-
 .../src/scan_safekeeper_metadata.rs           | 20 ++++++------
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index c702c0c312..54812ffc94 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -4,7 +4,7 @@ use anyhow::{anyhow, Context};
 use async_stream::{stream, try_stream};
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
 use futures::StreamExt;
-use remote_storage::{GenericRemoteStorage, ListingMode};
+use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
 use tokio_stream::Stream;
 
 use crate::{
@@ -276,3 +276,33 @@ pub(crate) fn stream_listing<'a>(
         }
     }
 }
+
+pub(crate) fn stream_listing_generic<'a>(
+    remote_client: &'a GenericRemoteStorage,
+    target: &'a S3Target,
+) -> impl Stream<Item = anyhow::Result<(RemotePath, Option<ListingObject>)>> + 'a {
+    let listing_mode = if target.delimiter.is_empty() {
+        ListingMode::NoDelimiter
+    } else {
+        ListingMode::WithDelimiter
+    };
+    try_stream! {
+        let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
+            remote_client,
+            listing_mode,
+            target,
+        ));
+        while let Some(list) = objects_stream.next().await {
+            let list = list?;
+            if target.delimiter.is_empty() {
+                for key in list.keys {
+                    yield (key.key.clone(), Some(key));
+                }
+            } else {
+                for key in list.prefixes {
+                    yield (key, None);
+                }
+            }
+        }
+    }
+}
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
index 553adf8f46..08a4541c5c 100644
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -1,10 +1,10 @@
 use std::{collections::HashSet, str::FromStr, sync::Arc};
 
-use aws_sdk_s3::Client;
 use futures::stream::{StreamExt, TryStreamExt};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use postgres_ffi::{XLogFileName, PG_TLI};
+use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
 use tokio_postgres::types::PgLsn;
 use tracing::{error, info, trace};
@@ -14,8 +14,9 @@ use utils::{
 };
 
 use crate::{
-    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
-    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
+    cloud_admin_api::CloudAdminApiClient, init_remote_generic,
+    metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget,
+    TenantShardTimelineId,
 };
 
 /// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
@@ -106,7 +107,7 @@ pub async fn scan_safekeeper_metadata(
     let timelines = client.query(&query, &[]).await?;
     info!("loaded {} timelines", timelines.len());
 
-    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
+    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?;
     let console_config = ConsoleConfig::from_env()?;
     let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
 
@@ -119,7 +120,7 @@ pub async fn scan_safekeeper_metadata(
         let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
         let ttid = TenantTimelineId::new(tenant_id, timeline_id);
         check_timeline(
-            &s3_client,
+            &remote_client,
             &target,
             &cloud_admin_api_client,
             ttid,
@@ -156,7 +157,7 @@ struct TimelineCheckResult {
 /// errors are logged to stderr; returns Ok(true) if timeline is consistent,
 /// Ok(false) if not, Err if failed to check.
 async fn check_timeline(
-    s3_client: &Client,
+    remote_client: &GenericRemoteStorage,
     root: &RootTarget,
     api_client: &CloudAdminApiClient,
     ttid: TenantTimelineId,
@@ -187,12 +188,13 @@ async fn check_timeline(
     // we need files, so unset it.
     timeline_dir_target.delimiter = String::new();
 
-    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
+    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
     while let Some(obj) = stream.next().await {
-        let obj = obj?;
-        let key = obj.key();
+        let (key, _obj) = obj?;
 
         let seg_name = key
+            .get_path()
+            .as_str()
             .strip_prefix(&timeline_dir_target.prefix_in_bucket)
             .expect("failed to extract segment name");
         expected_segfiles.remove(seg_name);

From bd276839add2c9a417263854a98a4aee3c8e24e4 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 6 Aug 2024 11:52:48 +0300
Subject: [PATCH 14/48] Add package-mode=false to poetry.

We don't use it for packaging, and 'poetry install' will soon error
otherwise. Also remove name and version fields as these are not required for
non-packaging mode.
---
 pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0d5782ac7c..36a1e24ca1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,8 +1,7 @@
 [tool.poetry]
-name = "neon"
-version = "0.1.0"
 description = ""
 authors = []
+package-mode = false
 
 [tool.poetry.dependencies]
 python = "^3.9"

From 13e794a35cf9e03794b3b904e4b6804645a8ee57 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 6 Aug 2024 13:08:55 +0200
Subject: [PATCH 15/48] Add a test using Debezium as a client for the logical
 replication (#8568)

## Problem
We need to test the logical replication with some external consumers.
## Summary of changes
A test of the logical replication with Debezium as a consumer was added.
---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/pg-clients.yml              |  26 ++-
 poetry.lock                                   |  16 +-
 pyproject.toml                                |   2 +
 test_runner/fixtures/utils.py                 |   7 +-
 test_runner/logical_repl/README.md            |  22 ++
 .../clickhouse/docker-compose.yml             |   9 +
 .../logical_repl/debezium/docker-compose.yml  |  24 +++
 .../{test_log_repl.py => test_clickhouse.py}  |  16 +-
 test_runner/logical_repl/test_debezium.py     | 189 ++++++++++++++++++
 9 files changed, 297 insertions(+), 14 deletions(-)
 create mode 100644 test_runner/logical_repl/README.md
 create mode 100644 test_runner/logical_repl/clickhouse/docker-compose.yml
 create mode 100644 test_runner/logical_repl/debezium/docker-compose.yml
 rename test_runner/logical_repl/{test_log_repl.py => test_clickhouse.py} (85%)
 create mode 100644 test_runner/logical_repl/test_debezium.py

diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml
index 55b68ccdb5..23a2e3876c 100644
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -66,7 +66,31 @@ jobs:
         ports:
           - 9000:9000
           - 8123:8123
-
+      zookeeper:
+        image: quay.io/debezium/zookeeper:2.7
+        ports:
+          - 2181:2181
+      kafka:
+        image: quay.io/debezium/kafka:2.7
+        env:
+          ZOOKEEPER_CONNECT: "zookeeper:2181"
+          KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
+          KAFKA_BROKER_ID: 1
+          KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
+          KAFKA_JMX_PORT: 9991
+        ports:
+          - 9092:9092
+      debezium:
+        image: quay.io/debezium/connect:2.7
+        env:
+          BOOTSTRAP_SERVERS: kafka:9092
+          GROUP_ID: 1
+          CONFIG_STORAGE_TOPIC: debezium-config
+          OFFSET_STORAGE_TOPIC: debezium-offset
+          STATUS_STORAGE_TOPIC: debezium-status
+          DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
+        ports:
+          - 8083:8083
     steps:
       - uses: actions/checkout@v4
 
diff --git a/poetry.lock b/poetry.lock
index d7a3dde65b..9026824558 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1514,6 +1514,20 @@ files = [
 [package.dependencies]
 six = "*"
 
+[[package]]
+name = "kafka-python"
+version = "2.0.2"
+description = "Pure Python client for Apache Kafka"
+optional = false
+python-versions = "*"
+files = [
+    {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
+    {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
+]
+
+[package.extras]
+crc32c = ["crc32c"]
+
 [[package]]
 name = "lazy-object-proxy"
 version = "1.10.0"
@@ -3357,4 +3371,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
+content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
diff --git a/pyproject.toml b/pyproject.toml
index 36a1e24ca1..cfb569b2ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ httpx = {extras = ["http2"], version = "^0.26.0"}
 pytest-repeat = "^0.9.3"
 websockets = "^12.0"
 clickhouse-connect = "^0.7.16"
+kafka-python = "^2.0.2"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
@@ -74,6 +75,7 @@ module = [
     "allure.*",
     "allure_commons.*",
     "allure_pytest.*",
+    "kafka.*",
 ]
 ignore_missing_imports = true
 
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 7f54eb0b0a..4dc9f7caae 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -389,7 +389,10 @@ WaitUntilRet = TypeVar("WaitUntilRet")
 
 
 def wait_until(
-    number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
+    number_of_iterations: int,
+    interval: float,
+    func: Callable[[], WaitUntilRet],
+    show_intermediate_error=False,
 ) -> WaitUntilRet:
     """
     Wait until 'func' returns successfully, without exception. Returns the
@@ -402,6 +405,8 @@ def wait_until(
         except Exception as e:
             log.info("waiting for %s iteration %s failed", func, i + 1)
             last_exception = e
+            if show_intermediate_error:
+                log.info(e)
             time.sleep(interval)
             continue
         return res
diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md
new file mode 100644
index 0000000000..8eca056dda
--- /dev/null
+++ b/test_runner/logical_repl/README.md
@@ -0,0 +1,22 @@
+# Logical replication tests
+
+## Clickhouse
+
+```bash
+export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
+
+docker compose -f clickhouse/docker-compose.yml up -d
+pytest -m remote_cluster -k test_clickhouse
+docker compose -f clickhouse/docker-compose.yml down
+```
+
+## Debezium
+
+```bash
+export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
+
+docker compose -f debezium/docker-compose.yml up -d
+pytest -m remote_cluster -k test_debezium
+docker compose -f debezium/docker-compose.yml down
+
+```
\ No newline at end of file
diff --git a/test_runner/logical_repl/clickhouse/docker-compose.yml b/test_runner/logical_repl/clickhouse/docker-compose.yml
new file mode 100644
index 0000000000..e00038b811
--- /dev/null
+++ b/test_runner/logical_repl/clickhouse/docker-compose.yml
@@ -0,0 +1,9 @@
+services:
+  clickhouse:
+    image: clickhouse/clickhouse-server
+    user: "101:101"
+    container_name: clickhouse
+    hostname: clickhouse
+    ports:
+      - 127.0.0.1:8123:8123
+      - 127.0.0.1:9000:9000
diff --git a/test_runner/logical_repl/debezium/docker-compose.yml b/test_runner/logical_repl/debezium/docker-compose.yml
new file mode 100644
index 0000000000..fee127a2fd
--- /dev/null
+++ b/test_runner/logical_repl/debezium/docker-compose.yml
@@ -0,0 +1,24 @@
+services:
+  zookeeper:
+    image: quay.io/debezium/zookeeper:2.7
+  kafka:
+    image: quay.io/debezium/kafka:2.7
+    environment:
+      ZOOKEEPER_CONNECT: "zookeeper:2181"
+      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
+      KAFKA_BROKER_ID: 1
+      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
+      KAFKA_JMX_PORT: 9991
+    ports:
+      - 127.0.0.1:9092:9092
+  debezium:
+    image: quay.io/debezium/connect:2.7
+    environment:
+      BOOTSTRAP_SERVERS: kafka:9092
+      GROUP_ID: 1
+      CONFIG_STORAGE_TOPIC: debezium-config
+      OFFSET_STORAGE_TOPIC: debezium-offset
+      STATUS_STORAGE_TOPIC: debezium-status
+      DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
+    ports:
+      - 127.0.0.1:8083:8083
diff --git a/test_runner/logical_repl/test_log_repl.py b/test_runner/logical_repl/test_clickhouse.py
similarity index 85%
rename from test_runner/logical_repl/test_log_repl.py
rename to test_runner/logical_repl/test_clickhouse.py
index 0a1aecfe2b..c5ed9bc8af 100644
--- a/test_runner/logical_repl/test_log_repl.py
+++ b/test_runner/logical_repl/test_clickhouse.py
@@ -1,8 +1,9 @@
 """
-Test the logical replication in Neon with the different consumers
+Test the logical replication in Neon with ClickHouse as a consumer
 """
 
 import hashlib
+import os
 import time
 
 import clickhouse_connect
@@ -39,22 +40,15 @@ def test_clickhouse(remote_pg: RemotePostgres):
     """
     Test the logical replication having ClickHouse as a client
     """
+    clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1"
     conn_options = remote_pg.conn_options()
-    for _ in range(5):
-        try:
-            conn = psycopg2.connect(remote_pg.connstr())
-        except psycopg2.OperationalError as perr:
-            log.debug(perr)
-            time.sleep(1)
-        else:
-            break
-        raise TimeoutError
+    conn = psycopg2.connect(remote_pg.connstr())
     cur = conn.cursor()
     cur.execute("DROP TABLE IF EXISTS table1")
     cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
     cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
     conn.commit()
-    client = clickhouse_connect.get_client(host="clickhouse")
+    client = clickhouse_connect.get_client(host=clickhouse_host)
     client.command("SET allow_experimental_database_materialized_postgresql=1")
     client.command(
         "CREATE DATABASE db1_postgres ENGINE = "
diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py
new file mode 100644
index 0000000000..700b731418
--- /dev/null
+++ b/test_runner/logical_repl/test_debezium.py
@@ -0,0 +1,189 @@
+"""
+Test the logical replication in Neon with Debezium as a consumer
+"""
+
+import json
+import os
+import time
+
+import psycopg2
+import pytest
+import requests
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import RemotePostgres
+from fixtures.utils import wait_until
+from kafka import KafkaConsumer
+
+
+class DebeziumAPI:
+    """
+    The class for Debezium API calls
+    """
+
+    def __init__(self):
+        self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1"
+        self.__base_url = f"http://{self.__host}:8083"
+        self.__connectors_url = f"{self.__base_url}/connectors"
+
+    def __request(self, method, addurl="", **kwargs):
+        return requests.request(
+            method,
+            self.__connectors_url + addurl,
+            headers={"Accept": "application/json", "Content-type": "application/json"},
+            timeout=60,
+            **kwargs,
+        )
+
+    def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str):
+        """
+        Create a Postgres connector in debezium
+        """
+        conn_options = remote_pg.conn_options()
+        payload = {
+            "name": dbz_conn_name,
+            "config": {
+                "connector.class": "io.debezium.connector.postgresql.PostgresConnector",
+                "tasks.max": "1",
+                "database.hostname": conn_options["host"],
+                "database.port": "5432",
+                "database.user": conn_options["user"],
+                "database.password": conn_options["password"],
+                "database.dbname": conn_options["dbname"],
+                "plugin.name": "pgoutput",
+                "topic.prefix": "dbserver1",
+                "schema.include.list": "inventory",
+            },
+        }
+        return self.__request("POST", json=payload)
+
+    def list_connectors(self):
+        """
+        Returns a list of all connectors existent in Debezium.
+        """
+        resp = self.__request("GET")
+        assert resp.ok
+        return json.loads(resp.text)
+
+    def del_connector(self, connector):
+        """
+        Deletes the specified connector
+        """
+        return self.__request("DELETE", f"/{connector}")
+
+
+@pytest.fixture(scope="function")
+def debezium(remote_pg: RemotePostgres):
+    """
+    Prepare the Debezium API handler, connection
+    """
+    conn = psycopg2.connect(remote_pg.connstr())
+    cur = conn.cursor()
+    cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE")
+    cur.execute("CREATE SCHEMA inventory")
+    cur.execute(
+        "CREATE TABLE inventory.customers ("
+        "id SERIAL NOT NULL PRIMARY KEY,"
+        "first_name character varying(255) NOT NULL,"
+        "last_name character varying(255) NOT NULL,"
+        "email character varying(255) NOT NULL)"
+    )
+    conn.commit()
+    dbz = DebeziumAPI()
+    assert len(dbz.list_connectors()) == 0
+    dbz_conn_name = "inventory-connector"
+    resp = dbz.create_pg_connector(remote_pg, dbz_conn_name)
+    log.debug("%s %s %s", resp.status_code, resp.ok, resp.text)
+    assert resp.status_code == 201
+    assert len(dbz.list_connectors()) == 1
+    consumer = KafkaConsumer(
+        "dbserver1.inventory.customers",
+        bootstrap_servers=["kafka:9092"],
+        auto_offset_reset="earliest",
+        enable_auto_commit=False,
+    )
+    yield conn, consumer
+    resp = dbz.del_connector(dbz_conn_name)
+    assert resp.status_code == 204
+
+
+def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None:
+    """
+    Gets the message from Kafka and checks its validity
+    Arguments:
+        consumer: the consumer object
+        ts_ms:    timestamp in milliseconds of the change of db, the corresponding message must have
+                  the later timestamp
+        before:   a dictionary, if not None, the before field from the kafka message must
+                  have the same values for the same keys
+        after:    a dictionary, if not None, the after field from the kafka message must
+                  have the same values for the same keys
+    """
+    msg = consumer.poll()
+    assert msg, "Empty message"
+    for val in msg.values():
+        r = json.loads(val[-1].value)
+        log.info(r["payload"])
+        assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp"
+        for param, pname in ((before, "before"), (after, "after")):
+            if param is not None:
+                for k, v in param.items():
+                    assert r["payload"][pname][k] == v, f"{pname} mismatches"
+
+
+@pytest.mark.remote_cluster
+def test_debezium(debezium):
+    """
+    Test the logical replication having Debezium as a subscriber
+    """
+    conn, consumer = debezium
+    cur = conn.cursor()
+    ts_ms = time.time() * 1000
+    log.info("Insert 1 ts_ms: %s", ts_ms)
+    cur.execute(
+        "insert into inventory.customers (first_name, last_name, email) "
+        "values ('John', 'Dow','johndow@example.com')"
+    )
+    conn.commit()
+    wait_until(
+        100,
+        0.5,
+        lambda: get_kafka_msg(
+            consumer,
+            ts_ms,
+            after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"},
+        ),
+        show_intermediate_error=True,
+    )
+    ts_ms = time.time() * 1000
+    log.info("Insert 2 ts_ms: %s", ts_ms)
+    cur.execute(
+        "insert into inventory.customers (first_name, last_name, email) "
+        "values ('Alex', 'Row','alexrow@example.com')"
+    )
+    conn.commit()
+    wait_until(
+        100,
+        0.5,
+        lambda: get_kafka_msg(
+            consumer,
+            ts_ms,
+            after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"},
+        ),
+        show_intermediate_error=True,
+    )
+    ts_ms = time.time() * 1000
+    log.info("Update ts_ms: %s", ts_ms)
+    cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2")
+    conn.commit()
+    wait_until(
+        100,
+        0.5,
+        lambda: get_kafka_msg(
+            consumer,
+            ts_ms,
+            after={"first_name": "Alexander"},
+        ),
+        show_intermediate_error=True,
+    )
+    time.sleep(3)
+    cur.execute("select 1")

From 0be952fb8904c9ad41d350cab8aa846ba1fde411 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 6 Aug 2024 13:56:42 +0200
Subject: [PATCH 16/48] enable rum test (#8380)

## Problem
We need to test the rum extension automatically as a path of the GitHub
workflow

## Summary of changes

rum test is enabled
---
 Dockerfile.compute-node               |  6 ++++--
 docker-compose/docker_compose_test.sh |  2 +-
 docker-compose/run-tests.sh           | 10 +++++-----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 5e53a55316..054d44e0ec 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -933,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
 #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
 COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
 COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
-#COPY --from=rum-pg-build /rum.tar.gz /ext-src
+COPY --from=rum-pg-build /rum.tar.gz /ext-src
+COPY patches/rum.patch /ext-src
 #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
 COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
 COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -945,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
-COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
+#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
 COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
 COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
 COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -960,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
     rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
     || exit 1; rm -f $f; done
 RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
+RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 # cmake is required for the h3 test
 RUN apt-get update && apt-get install -y cmake
 RUN patch -p1 < /ext-src/pg_hintplan.patch
diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh
index a00591afd0..10805a9952 100755
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
         docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
         rm -rf $TMPDIR
         # We are running tests now
-        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
+        if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
             $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
         then
             cleanup
diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh
index c05fc159aa..58b2581197 100644
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,15 +1,15 @@
 #!/bin/bash
 set -x
 
-cd /ext-src
+cd /ext-src || exit 2
 FAILED=
-LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
+LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
 for d in ${LIST}
 do
-       [ -d ${d} ] || continue
+       [ -d "${d}" ] || continue
     psql -c "select 1" >/dev/null || break
-       make -C ${d} installcheck || FAILED="${d} ${FAILED}"
+       USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
 done
 [ -z "${FAILED}" ] && exit 0
-echo ${FAILED}
+echo "${FAILED}"
 exit 1
\ No newline at end of file

From 8d8e428d4ccd8b6e427c0b9e321c05abcc8bc296 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 6 Aug 2024 12:58:33 +0100
Subject: [PATCH 17/48] tests: improve stability of
 `test_storage_controller_many_tenants` (#8607)

## Problem

The controller scale test does random migrations. These mutate secondary
locations, and therefore can cause secondary optimizations to happen in
the background, violating the test's expectation that consistency_check
will work as there are no reconciliations running.

Example:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10247161379/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/6316beacd3fb3060/

## Summary of changes

- Only migrate to existing secondary locations, not randomly picked
nodes, so that we can do a fast reconcile_until_idle (otherwise
reconcile_until_idle is takes a long time to create new secondary
locations).
- Do a reconcile_until_idle before consistency_check.
---
 .../performance/test_storage_controller_scale.py     | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 281c9271e9..04785f7184 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -217,7 +217,11 @@ def test_storage_controller_many_tenants(
                 # A reconciler operation: migrate a shard.
                 shard_number = rng.randint(0, shard_count - 1)
                 tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
-                dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
+
+                # Migrate it to its secondary location
+                desc = env.storage_controller.tenant_describe(tenant_id)
+                dest_ps_id = desc["shards"][shard_number]["node_secondary"][0]
+
                 f = executor.submit(
                     env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
                 )
@@ -231,7 +235,11 @@ def test_storage_controller_many_tenants(
         for f in futs:
             f.result()
 
-    # Consistency check is safe here: all the previous operations waited for reconcile before completing
+    # Some of the operations above (notably migrations) might leave the controller in a state where it has
+    # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system
+    # to reach a quiescent state before doing following checks.
+    env.storage_controller.reconcile_until_idle()
+
     env.storage_controller.consistency_check()
     check_memory()
 

From ea7be4152aeda69bd682359e7777df4662d59d0a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 6 Aug 2024 14:47:01 +0100
Subject: [PATCH 18/48] pageserver: fixes for layer visibility metric (#8603)

## Problem

In staging, we could see that occasionally tenants were wrapping their
pageserver_visible_physical_size metric past zero to 2^64.

This is harmless right now, but will matter more later when we start
using visible size in things like the /utilization endpoint.

## Summary of changes

- Add debug asserts that detect this case. `test_gc_of_remote_layers`
works as a reproducer for this issue once the asserts are added.
- Tighten up the interface around access_stats so that only Layer can
mutate it.
- In Layer, wrap calls to `record_access` in code that will update the
visible size statistic if the access implicitly marks the layer visible
(this was what caused the bug)
- In LayerManager::rewrite_layers, use the proper set_visibility layer
function instead of directly using access_stats (this is an additional
path where metrics could go bad.)
- Removed unused instances of LayerAccessStats in DeltaLayer and
ImageLayer which I noticed while reviewing the code paths that call
record_access.
---
 pageserver/src/tenant/storage_layer.rs        | 14 +++++---
 .../src/tenant/storage_layer/delta_layer.rs   |  8 +----
 .../src/tenant/storage_layer/image_layer.rs   |  4 ---
 pageserver/src/tenant/storage_layer/layer.rs  | 35 ++++++++++++++++---
 pageserver/src/tenant/timeline.rs             |  4 +--
 .../src/tenant/timeline/eviction_task.rs      |  2 +-
 .../src/tenant/timeline/layer_manager.rs      |  7 ++--
 7 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index ab32a6035e..04f89db401 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -539,19 +539,25 @@ impl LayerAccessStats {
         self.record_residence_event_at(SystemTime::now())
     }
 
-    pub(crate) fn record_access_at(&self, now: SystemTime) {
+    fn record_access_at(&self, now: SystemTime) -> bool {
         let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
 
         // A layer which is accessed must be visible.
         mask |= 0x1 << Self::VISIBILITY_SHIFT;
         value |= 0x1 << Self::VISIBILITY_SHIFT;
 
-        self.write_bits(mask, value);
+        let old_bits = self.write_bits(mask, value);
+        !matches!(
+            self.decode_visibility(old_bits),
+            LayerVisibilityHint::Visible
+        )
     }
 
-    pub(crate) fn record_access(&self, ctx: &RequestContext) {
+    /// Returns true if we modified the layer's visibility to set it to Visible implicitly
+    /// as a result of this access
+    pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
         if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
-            return;
+            return false;
         }
 
         self.record_access_at(SystemTime::now())
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index a17dd28547..962faa6796 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -72,10 +72,7 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::{
-    AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
-    ValuesReconstructState,
-};
+use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
 
 ///
 /// Header stored in the beginning of the file
@@ -200,7 +197,6 @@ impl DeltaKey {
 pub struct DeltaLayer {
     path: Utf8PathBuf,
     pub desc: PersistentLayerDesc,
-    access_stats: LayerAccessStats,
     inner: OnceCell<Arc<DeltaLayerInner>>,
 }
 
@@ -299,7 +295,6 @@ impl DeltaLayer {
     /// not loaded already.
     ///
     async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
-        self.access_stats.record_access(ctx);
         // Quick exit if already loaded
         self.inner
             .get_or_try_init(|| self.load_inner(ctx))
@@ -350,7 +345,6 @@ impl DeltaLayer {
                 summary.lsn_range,
                 metadata.len(),
             ),
-            access_stats: Default::default(),
             inner: OnceCell::new(),
         })
     }
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index b2173455ab..16ba0fda94 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -32,7 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{
     DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
-use crate::tenant::storage_layer::LayerAccessStats;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -135,7 +134,6 @@ pub struct ImageLayer {
     pub desc: PersistentLayerDesc,
     // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
     pub lsn: Lsn,
-    access_stats: LayerAccessStats,
     inner: OnceCell<ImageLayerInner>,
 }
 
@@ -253,7 +251,6 @@ impl ImageLayer {
     /// not loaded already.
     ///
     async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
-        self.access_stats.record_access(ctx);
         self.inner
             .get_or_try_init(|| self.load_inner(ctx))
             .await
@@ -304,7 +301,6 @@ impl ImageLayer {
                 metadata.len(),
             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
             lsn: summary.lsn,
-            access_stats: Default::default(),
             inner: OnceCell::new(),
         })
     }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index cee2fe7342..83450d24bb 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -316,7 +316,7 @@ impl Layer {
                 other => GetVectoredError::Other(anyhow::anyhow!(other)),
             })?;
 
-        self.0.access_stats.record_access(ctx);
+        self.record_access(ctx);
 
         layer
             .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -396,8 +396,12 @@ impl Layer {
         self.0.info(reset)
     }
 
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.0.access_stats
+    pub(crate) fn latest_activity(&self) -> SystemTime {
+        self.0.access_stats.latest_activity()
+    }
+
+    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
+        self.0.access_stats.visibility()
     }
 
     pub(crate) fn local_path(&self) -> &Utf8Path {
@@ -447,13 +451,31 @@ impl Layer {
         }
     }
 
+    fn record_access(&self, ctx: &RequestContext) {
+        if self.0.access_stats.record_access(ctx) {
+            // Visibility was modified to Visible
+            tracing::info!(
+                "Layer {} became visible as a result of access",
+                self.0.desc.key()
+            );
+            if let Some(tl) = self.0.timeline.upgrade() {
+                tl.metrics
+                    .visible_physical_size_gauge
+                    .add(self.0.desc.file_size)
+            }
+        }
+    }
+
     pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
-        let old_visibility = self.access_stats().set_visibility(visibility.clone());
+        let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
         use LayerVisibilityHint::*;
         match (old_visibility, visibility) {
             (Visible, Covered) => {
                 // Subtract this layer's contribution to the visible size metric
                 if let Some(tl) = self.0.timeline.upgrade() {
+                    debug_assert!(
+                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
+                    );
                     tl.metrics
                         .visible_physical_size_gauge
                         .sub(self.0.desc.file_size)
@@ -671,6 +693,9 @@ impl Drop for LayerInner {
             }
 
             if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
+                debug_assert!(
+                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
+                );
                 timeline
                     .metrics
                     .visible_physical_size_gauge
@@ -1810,7 +1835,7 @@ impl ResidentLayer {
                 // this is valid because the DownloadedLayer::kind is a OnceCell, not a
                 // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
                 // while it's being held.
-                owner.access_stats.record_access(ctx);
+                self.owner.record_access(ctx);
 
                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
                     .await
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5a02fd4a4c..6c67fb9cb6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2920,7 +2920,7 @@ impl Timeline {
         let guard = self.layers.read().await;
 
         let resident = guard.likely_resident_layers().map(|layer| {
-            let last_activity_ts = layer.access_stats().latest_activity();
+            let last_activity_ts = layer.latest_activity();
 
             HeatMapLayer::new(
                 layer.layer_desc().layer_name(),
@@ -5182,7 +5182,7 @@ impl Timeline {
                 let file_size = layer.layer_desc().file_size;
                 max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
 
-                let last_activity_ts = layer.access_stats().latest_activity();
+                let last_activity_ts = layer.latest_activity();
 
                 EvictionCandidate {
                     layer: layer.into(),
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index fec66aabc1..1ba1bf9de5 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -225,7 +225,7 @@ impl Timeline {
                     continue;
                 }
 
-                let last_activity_ts = layer.access_stats().latest_activity();
+                let last_activity_ts = layer.latest_activity();
 
                 let no_activity_for = match now.duration_since(last_activity_ts) {
                     Ok(d) => d,
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 1bc2acbd34..e6e7bc2e77 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -259,13 +259,10 @@ impl LayerManager {
                 new_layer.layer_desc().lsn_range
             );
 
-            // Transfer visibilty hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
+            // Transfer visibility hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
             // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
             // always marking rewritten layers as visible.
-            new_layer
-                .as_ref()
-                .access_stats()
-                .set_visibility(old_layer.access_stats().visibility());
+            new_layer.as_ref().set_visibility(old_layer.visibility());
 
             // Safety: we may never rewrite the same file in-place.  Callers are responsible
             // for ensuring that they only rewrite layers after something changes the path,

From 6519f875b99c50ddc08ece91eb553fc92c1bb8bf Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 6 Aug 2024 17:15:40 +0100
Subject: [PATCH 19/48] pageserver: use layer visibility when composing heatmap
 (#8616)

## Problem

Sometimes, a layer is Covered by hasn't yet been evicted from local disk
(e.g. shortly after image layer generation). It is not good use of
resources to download these to a secondary location, as there's a good
chance they will never be read.

This follows the previous change that added layer visibility:
- #8511

Part of epic:
- https://github.com/neondatabase/neon/issues/8398

## Summary of changes

- When generating heatmaps, only include Visible layers
- Update test_secondary_downloads to filter to visible layers when
listing layers from an attached location
---
 pageserver/src/tenant/timeline.rs             | 26 ++++++----
 test_runner/fixtures/pageserver/http.py       |  2 +
 .../regress/test_pageserver_secondary.py      | 52 +++++++++++++++----
 3 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6c67fb9cb6..4ff87f20f1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -137,7 +137,7 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
-use super::{config::TenantConf, upload_queue::NotInitialized};
+use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{
@@ -2919,14 +2919,22 @@ impl Timeline {
 
         let guard = self.layers.read().await;
 
-        let resident = guard.likely_resident_layers().map(|layer| {
-            let last_activity_ts = layer.latest_activity();
-
-            HeatMapLayer::new(
-                layer.layer_desc().layer_name(),
-                layer.metadata(),
-                last_activity_ts,
-            )
+        let resident = guard.likely_resident_layers().filter_map(|layer| {
+            match layer.visibility() {
+                LayerVisibilityHint::Visible => {
+                    // Layer is visible to one or more read LSNs: elegible for inclusion in layer map
+                    let last_activity_ts = layer.latest_activity();
+                    Some(HeatMapLayer::new(
+                        layer.layer_desc().layer_name(),
+                        layer.metadata(),
+                        last_activity_ts,
+                    ))
+                }
+                LayerVisibilityHint::Covered => {
+                    // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
+                    None
+                }
+            }
         });
 
         let layers = resident.collect();
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 61e2204b23..5be59d3749 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -61,6 +61,7 @@ class HistoricLayerInfo:
     remote: bool
     # None for image layers, true if pageserver thinks this is an L0 delta layer
     l0: Optional[bool]
+    visible: bool
 
     @classmethod
     def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
@@ -79,6 +80,7 @@ class HistoricLayerInfo:
             lsn_end=d.get("lsn_end"),
             remote=d["remote"],
             l0=l0_ness,
+            visible=d["access_stats"]["visible"],
         )
 
 
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 53f69b5b26..4b0af24480 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -2,10 +2,11 @@ import json
 import os
 import random
 import time
-from typing import Any, Dict, Optional
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
 
 import pytest
-from fixtures.common_types import TenantId, TimelineId
+from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
 from fixtures.pageserver.common_types import parse_layer_file_name
@@ -437,6 +438,35 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
     validate_heatmap(heatmap_second)
 
 
+def list_elegible_layers(
+    pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+) -> list[Path]:
+    """
+    The subset of layer filenames that are elegible for secondary download: at time of writing this
+    is all resident layers which are also visible.
+    """
+    candidates = pageserver.list_layers(tenant_id, timeline_id)
+
+    layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id)
+
+    # Map of layer filenames to their visibility the "layer name" is not the same as the filename: add suffix to resolve one to the other
+    visible_map = dict(
+        (f"{layer.layer_file_name}-v1-00000001", layer.visible)
+        for layer in layer_map.historic_layers
+    )
+
+    def is_visible(layer_file_name):
+        try:
+            return visible_map[str(layer_file_name)]
+        except KeyError:
+            # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map
+            # matches what's on disk.
+            log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
+            raise
+
+    return list(c for c in candidates if is_visible(c))
+
+
 def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     """
     Test the overall data flow in secondary mode:
@@ -491,7 +521,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
 
     ps_secondary.http_client().tenant_secondary_download(tenant_id)
 
-    assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
+    assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers(
         tenant_id, timeline_id
     )
 
@@ -509,9 +539,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     ps_secondary.http_client().tenant_secondary_download(tenant_id)
 
     try:
-        assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
-            tenant_id, timeline_id
-        )
+        assert list_elegible_layers(
+            ps_attached, tenant_id, timeline_id
+        ) == ps_secondary.list_layers(tenant_id, timeline_id)
     except:
         # Do a full listing of the secondary location on errors, to help debug of
         # https://github.com/neondatabase/neon/issues/6966
@@ -532,8 +562,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     # ==================================================================
     try:
         log.info("Evicting a layer...")
-        layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0]
-        some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1]
+        layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0]
+        some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1]
         log.info(f"Victim layer: {layer_to_evict.name}")
         ps_attached.http_client().evict_layer(
             tenant_id, timeline_id, layer_name=layer_to_evict.name
@@ -551,9 +581,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
         ps_secondary.http_client().tenant_secondary_download(tenant_id)
 
         assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id)
-        assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
-            tenant_id, timeline_id
-        )
+        assert list_elegible_layers(
+            ps_attached, tenant_id, timeline_id
+        ) == ps_secondary.list_layers(tenant_id, timeline_id)
     except:
         # On assertion failures, log some details to help with debugging
         heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)

From ba4e5b51a0eba308630312cfa8fa1ac306e19290 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 6 Aug 2024 17:39:40 +0100
Subject: [PATCH 20/48] pageserver: add `bench_ingest` (#7409)

## Problem

We lack a rust bench for the inmemory layer and delta layer write paths:
it is useful to benchmark these components independent of postgres & WAL
decoding.

Related: https://github.com/neondatabase/neon/issues/8452

## Summary of changes

- Refactor DeltaLayerWriter to avoid carrying a Timeline, so that it can
be cleanly tested + benched without a Tenant/Timeline test harness. It
only needed the Timeline for building `Layer`, so this can be done in a
separate step.
- Add `bench_ingest`, which exercises a variety of workload "shapes"
(big values, small values, sequential keys, random keys)
- Include a small uncontroversial optimization: in `freeze`, only
exhaustively walk values to assert ordering relative to end_lsn in debug
mode.

These benches are limited by drive performance on a lot of machines, but
still useful as a local tool for iterating on CPU/memory improvements
around this code path.

Anecdotal measurements on Hetzner AX102 (Ryzen 7950xd):

```

ingest-small-values/ingest 128MB/100b seq
                        time:   [1.1160 s 1.1230 s 1.1289 s]
                        thrpt:  [113.38 MiB/s 113.98 MiB/s 114.70 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) low mild
Benchmarking ingest-small-values/ingest 128MB/100b rand: Warming up for 3.0000 s
Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 18.9s.
ingest-small-values/ingest 128MB/100b rand
                        time:   [1.9001 s 1.9056 s 1.9110 s]
                        thrpt:  [66.982 MiB/s 67.171 MiB/s 67.365 MiB/s]
Benchmarking ingest-small-values/ingest 128MB/100b rand-1024keys: Warming up for 3.0000 s
Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 11.0s.
ingest-small-values/ingest 128MB/100b rand-1024keys
                        time:   [1.0715 s 1.0828 s 1.0937 s]
                        thrpt:  [117.04 MiB/s 118.21 MiB/s 119.46 MiB/s]
ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [425.49 ms 429.07 ms 432.04 ms]
                        thrpt:  [296.27 MiB/s 298.32 MiB/s 300.83 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) low mild

ingest-big-values/ingest 128MB/8k seq
                        time:   [373.03 ms 375.84 ms 379.17 ms]
                        thrpt:  [337.58 MiB/s 340.57 MiB/s 343.13 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) high mild
ingest-big-values/ingest 128MB/8k seq, no delta
                        time:   [81.534 ms 82.811 ms 83.364 ms]
                        thrpt:  [1.4994 GiB/s 1.5095 GiB/s 1.5331 GiB/s]
Found 1 outliers among 10 measurements (10.00%)


```
---
 pageserver/Cargo.toml                         |   4 +
 pageserver/benches/bench_ingest.rs            | 235 ++++++++++++++++++
 pageserver/src/l0_flush.rs                    |   4 +-
 .../src/tenant/storage_layer/delta_layer.rs   |  45 ++--
 .../tenant/storage_layer/inmemory_layer.rs    |  37 +--
 .../src/tenant/storage_layer/split_writer.rs  |  11 +-
 pageserver/src/tenant/timeline.rs             |  11 +-
 pageserver/src/tenant/timeline/compaction.rs  |  44 ++--
 .../src/tenant/timeline/detach_ancestor.rs    |   6 +-
 9 files changed, 322 insertions(+), 75 deletions(-)
 create mode 100644 pageserver/benches/bench_ingest.rs

diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 43976250a4..0e748ee3db 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -108,3 +108,7 @@ harness = false
 [[bench]]
 name = "bench_walredo"
 harness = false
+
+[[bench]]
+name = "bench_ingest"
+harness = false
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
new file mode 100644
index 0000000000..af2b6934c6
--- /dev/null
+++ b/pageserver/benches/bench_ingest.rs
@@ -0,0 +1,235 @@
+use std::{env, num::NonZeroUsize};
+
+use bytes::Bytes;
+use camino::Utf8PathBuf;
+use criterion::{criterion_group, criterion_main, Criterion};
+use pageserver::{
+    config::PageServerConf,
+    context::{DownloadBehavior, RequestContext},
+    l0_flush::{L0FlushConfig, L0FlushGlobalState},
+    page_cache,
+    repository::Value,
+    task_mgr::TaskKind,
+    tenant::storage_layer::InMemoryLayer,
+    virtual_file::{self, api::IoEngineKind},
+};
+use pageserver_api::{key::Key, shard::TenantShardId};
+use utils::{
+    bin_ser::BeSer,
+    id::{TenantId, TimelineId},
+};
+
+// A very cheap hash for generating non-sequential keys.
+fn murmurhash32(mut h: u32) -> u32 {
+    h ^= h >> 16;
+    h = h.wrapping_mul(0x85ebca6b);
+    h ^= h >> 13;
+    h = h.wrapping_mul(0xc2b2ae35);
+    h ^= h >> 16;
+    h
+}
+
+enum KeyLayout {
+    /// Sequential unique keys
+    Sequential,
+    /// Random unique keys
+    Random,
+    /// Random keys, but only use the bits from the mask of them
+    RandomReuse(u32),
+}
+
+enum WriteDelta {
+    Yes,
+    No,
+}
+
+async fn ingest(
+    conf: &'static PageServerConf,
+    put_size: usize,
+    put_count: usize,
+    key_layout: KeyLayout,
+    write_delta: WriteDelta,
+) -> anyhow::Result<()> {
+    let mut lsn = utils::lsn::Lsn(1000);
+    let mut key = Key::from_i128(0x0);
+
+    let timeline_id = TimelineId::generate();
+    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
+
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+
+    let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?;
+
+    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
+    let ctx = RequestContext::new(
+        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
+        pageserver::context::DownloadBehavior::Download,
+    );
+
+    for i in 0..put_count {
+        lsn += put_size as u64;
+
+        // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
+        // usually care the most about write performance when they're blasting a huge batch of data into a huge table.
+        match key_layout {
+            KeyLayout::Sequential => {
+                // Use sequential order to illustrate the experience a user is likely to have
+                // when ingesting bulk data.
+                key.field6 = i as u32;
+            }
+            KeyLayout::Random => {
+                // Use random-order keys to avoid giving a false advantage to data structures that are
+                // faster when inserting on the end.
+                key.field6 = murmurhash32(i as u32);
+            }
+            KeyLayout::RandomReuse(mask) => {
+                // Use low bits only, to limit cardinality
+                key.field6 = murmurhash32(i as u32) & mask;
+            }
+        }
+
+        layer.put_value(key, lsn, &data, &ctx).await?;
+    }
+    layer.freeze(lsn + 1).await;
+
+    if matches!(write_delta, WriteDelta::Yes) {
+        let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
+            max_concurrency: NonZeroUsize::new(1).unwrap(),
+        });
+        let (_desc, path) = layer
+            .write_to_disk(&ctx, None, l0_flush_state.inner())
+            .await?
+            .unwrap();
+        tokio::fs::remove_file(path).await?;
+    }
+
+    Ok(())
+}
+
+/// Wrapper to instantiate a tokio runtime
+fn ingest_main(
+    conf: &'static PageServerConf,
+    put_size: usize,
+    put_count: usize,
+    key_layout: KeyLayout,
+    write_delta: WriteDelta,
+) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    runtime.block_on(async move {
+        let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
+        if let Err(e) = r {
+            panic!("{e:?}");
+        }
+    });
+}
+
+/// Declare a series of benchmarks for the Pageserver's ingest write path.
+///
+/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
+/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
+///
+/// Genuine disk I/O is used, so expect results to differ depending on storage.  However, when running on
+/// a fast disk, CPU is the bottleneck at time of writing.
+fn criterion_benchmark(c: &mut Criterion) {
+    let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
+    let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
+    eprintln!("Data directory: {}", temp_dir.path());
+
+    let conf: &'static PageServerConf = Box::leak(Box::new(
+        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
+    ));
+    virtual_file::init(16384, IoEngineKind::TokioEpollUring);
+    page_cache::init(conf.page_cache_size);
+
+    {
+        let mut group = c.benchmark_group("ingest-small-values");
+        let put_size = 100usize;
+        let put_count = 128 * 1024 * 1024 / put_size;
+        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
+        group.sample_size(10);
+        group.bench_function("ingest 128MB/100b seq", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Sequential,
+                    WriteDelta::Yes,
+                )
+            })
+        });
+        group.bench_function("ingest 128MB/100b rand", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Random,
+                    WriteDelta::Yes,
+                )
+            })
+        });
+        group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::RandomReuse(0x3ff),
+                    WriteDelta::Yes,
+                )
+            })
+        });
+        group.bench_function("ingest 128MB/100b seq, no delta", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Sequential,
+                    WriteDelta::No,
+                )
+            })
+        });
+    }
+
+    {
+        let mut group = c.benchmark_group("ingest-big-values");
+        let put_size = 8192usize;
+        let put_count = 128 * 1024 * 1024 / put_size;
+        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
+        group.sample_size(10);
+        group.bench_function("ingest 128MB/8k seq", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Sequential,
+                    WriteDelta::Yes,
+                )
+            })
+        });
+        group.bench_function("ingest 128MB/8k seq, no delta", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Sequential,
+                    WriteDelta::No,
+                )
+            })
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs
index 8945e5accd..10187f2ba3 100644
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -24,7 +24,7 @@ impl Default for L0FlushConfig {
 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);
 
-pub(crate) enum Inner {
+pub enum Inner {
     PageCached,
     Direct { semaphore: tokio::sync::Semaphore },
 }
@@ -40,7 +40,7 @@ impl L0FlushGlobalState {
         }
     }
 
-    pub(crate) fn inner(&self) -> &Arc<Inner> {
+    pub fn inner(&self) -> &Arc<Inner> {
         &self.0
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 962faa6796..bff8f7cb24 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,13 +36,12 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{
     DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
-use crate::tenant::storage_layer::Layer;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
     VectoredReadPlanner,
 };
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -72,7 +71,7 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
+use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
 
 ///
 /// Header stored in the beginning of the file
@@ -367,7 +366,6 @@ impl DeltaLayer {
 /// 3. Call `finish`.
 ///
 struct DeltaLayerWriterInner {
-    conf: &'static PageServerConf,
     pub path: Utf8PathBuf,
     timeline_id: TimelineId,
     tenant_shard_id: TenantShardId,
@@ -414,7 +412,6 @@ impl DeltaLayerWriterInner {
         let tree_builder = DiskBtreeBuilder::new(block_buf);
 
         Ok(Self {
-            conf,
             path,
             timeline_id,
             tenant_shard_id,
@@ -489,11 +486,10 @@ impl DeltaLayerWriterInner {
     async fn finish(
         self,
         key_end: Key,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
         let temp_path = self.path.clone();
-        let result = self.finish0(key_end, timeline, ctx).await;
+        let result = self.finish0(key_end, ctx).await;
         if result.is_err() {
             tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
             if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -506,9 +502,8 @@ impl DeltaLayerWriterInner {
     async fn finish0(
         self,
         key_end: Key,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
         let index_start_blk =
             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
 
@@ -573,11 +568,9 @@ impl DeltaLayerWriterInner {
         // fsync the file
         file.sync_all().await?;
 
-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+        trace!("created delta layer {}", self.path);
 
-        trace!("created delta layer {}", layer.local_path());
-
-        Ok(layer)
+        Ok((desc, self.path))
     }
 }
 
@@ -678,14 +671,9 @@ impl DeltaLayerWriter {
     pub(crate) async fn finish(
         mut self,
         key_end: Key,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(key_end, timeline, ctx)
-            .await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(key_end, ctx).await
     }
 
     #[cfg(test)]
@@ -1592,8 +1580,9 @@ pub(crate) mod test {
     use super::*;
     use crate::repository::Value;
     use crate::tenant::harness::TIMELINE_ID;
+    use crate::tenant::storage_layer::{Layer, ResidentLayer};
     use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::Tenant;
+    use crate::tenant::{Tenant, Timeline};
     use crate::{
         context::DownloadBehavior,
         task_mgr::TaskKind,
@@ -1887,9 +1876,8 @@ pub(crate) mod test {
             res?;
         }
 
-        let resident = writer
-            .finish(entries_meta.key_range.end, &timeline, &ctx)
-            .await?;
+        let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
+        let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
 
         let inner = resident.get_as_delta(&ctx).await?;
 
@@ -2078,7 +2066,8 @@ pub(crate) mod test {
                 .await
                 .unwrap();
 
-            let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
+            let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
+            let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
 
             copied_layer.get_as_delta(ctx).await.unwrap();
 
@@ -2206,7 +2195,9 @@ pub(crate) mod test {
         for (key, lsn, value) in deltas {
             writer.put_value(key, lsn, value, ctx).await?;
         }
-        let delta_layer = writer.finish(key_end, tline, ctx).await?;
+
+        let (desc, path) = writer.finish(key_end, ctx).await?;
+        let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
 
         Ok::<_, anyhow::Error>(delta_layer)
     }
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 6abc89c2ed..f118f3d8d8 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -11,9 +11,10 @@ use crate::repository::{Key, Value};
 use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::PageReconstructError;
 use crate::{l0_flush, page_cache, walrecord};
 use anyhow::{anyhow, Result};
+use camino::Utf8PathBuf;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -32,7 +33,9 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::{RwLock, RwLockWriteGuard};
 
-use super::{DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValuesReconstructState};
+use super::{
+    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
+};
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
 pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
@@ -410,8 +413,7 @@ impl InMemoryLayer {
 
     /// Common subroutine of the public put_wal_record() and put_page_image() functions.
     /// Adds the page version to the in-memory tree
-
-    pub(crate) async fn put_value(
+    pub async fn put_value(
         &self,
         key: Key,
         lsn: Lsn,
@@ -476,8 +478,6 @@ impl InMemoryLayer {
     /// Records the end_lsn for non-dropped layers.
     /// `end_lsn` is exclusive
     pub async fn freeze(&self, end_lsn: Lsn) {
-        let inner = self.inner.write().await;
-
         assert!(
             self.start_lsn < end_lsn,
             "{} >= {}",
@@ -495,9 +495,13 @@ impl InMemoryLayer {
             })
             .expect("frozen_local_path_str set only once");
 
-        for vec_map in inner.index.values() {
-            for (lsn, _pos) in vec_map.as_slice() {
-                assert!(*lsn < end_lsn);
+        #[cfg(debug_assertions)]
+        {
+            let inner = self.inner.write().await;
+            for vec_map in inner.index.values() {
+                for (lsn, _pos) in vec_map.as_slice() {
+                    assert!(*lsn < end_lsn);
+                }
             }
         }
     }
@@ -507,12 +511,12 @@ impl InMemoryLayer {
     /// if there are no matching keys.
     ///
     /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(
+    pub async fn write_to_disk(
         &self,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
         key_range: Option<Range<Key>>,
-    ) -> Result<Option<ResidentLayer>> {
+        l0_flush_global_state: &l0_flush::Inner,
+    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
         // Grab the lock in read-mode. We hold it over the I/O, but because this
         // layer is not writeable anymore, no one should be trying to acquire the
         // write lock on it, so we shouldn't block anyone. There's one exception
@@ -524,9 +528,8 @@ impl InMemoryLayer {
         // rare though, so we just accept the potential latency hit for now.
         let inner = self.inner.read().await;
 
-        let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
         use l0_flush::Inner;
-        let _concurrency_permit = match &*l0_flush_global_state {
+        let _concurrency_permit = match l0_flush_global_state {
             Inner::PageCached => None,
             Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
         };
@@ -556,7 +559,7 @@ impl InMemoryLayer {
         )
         .await?;
 
-        match &*l0_flush_global_state {
+        match l0_flush_global_state {
             l0_flush::Inner::PageCached => {
                 let ctx = RequestContextBuilder::extend(ctx)
                     .page_content_kind(PageContentKind::InMemoryLayer)
@@ -621,7 +624,7 @@ impl InMemoryLayer {
         }
 
         // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
+        let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
 
         // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
         //
@@ -633,6 +636,6 @@ impl InMemoryLayer {
         // we dirtied when writing to the filesystem have been flushed and marked !dirty.
         drop(_concurrency_permit);
 
-        Ok(Some(delta_layer))
+        Ok(Some((desc, path)))
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index a966775f9e..d7bfe48c60 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -4,6 +4,7 @@ use bytes::Bytes;
 use pageserver_api::key::{Key, KEY_SIZE};
 use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
 
+use crate::tenant::storage_layer::Layer;
 use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
 
 use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
@@ -173,8 +174,9 @@ impl SplitDeltaLayerWriter {
             )
             .await?;
             let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
-            self.generated_layers
-                .push(prev_delta_writer.finish(key, tline, ctx).await?);
+            let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
+            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            self.generated_layers.push(delta_layer);
         }
         self.inner.put_value(key, lsn, val, ctx).await
     }
@@ -190,7 +192,10 @@ impl SplitDeltaLayerWriter {
             inner,
             ..
         } = self;
-        generated_layers.push(inner.finish(end_key, tline, ctx).await?);
+
+        let (desc, path) = inner.finish(end_key, ctx).await?;
+        let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+        generated_layers.push(delta_layer);
         Ok(generated_layers)
     }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4ff87f20f1..a05e4e0712 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3709,12 +3709,14 @@ impl Timeline {
         let frozen_layer = Arc::clone(frozen_layer);
         let ctx = ctx.attached_child();
         let work = async move {
-            let Some(new_delta) = frozen_layer
-                .write_to_disk(&self_clone, &ctx, key_range)
+            let Some((desc, path)) = frozen_layer
+                .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner())
                 .await?
             else {
                 return Ok(None);
             };
+            let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?;
+
             // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
             // We just need to fsync the directory in which these inodes are linked,
             // which we know to be the timeline directory.
@@ -5347,9 +5349,8 @@ impl Timeline {
         for (key, lsn, val) in deltas.data {
             delta_layer_writer.put_value(key, lsn, val, ctx).await?;
         }
-        let delta_layer = delta_layer_writer
-            .finish(deltas.key_range.end, self, ctx)
-            .await?;
+        let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
+        let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
 
         {
             let mut guard = self.layers.write().await;
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 1ff029a313..276d7b4967 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1104,14 +1104,16 @@ impl Timeline {
                         || contains_hole
                     {
                         // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(
-                            writer
-                                .take()
-                                .unwrap()
-                                .finish(prev_key.unwrap().next(), self, ctx)
-                                .await
-                                .map_err(CompactionError::Other)?,
-                        );
+                        let (desc, path) = writer
+                            .take()
+                            .unwrap()
+                            .finish(prev_key.unwrap().next(), ctx)
+                            .await
+                            .map_err(CompactionError::Other)?;
+                        let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
+                            .map_err(CompactionError::Other)?;
+
+                        new_layers.push(new_delta);
                         writer = None;
 
                         if contains_hole {
@@ -1174,12 +1176,13 @@ impl Timeline {
             prev_key = Some(key);
         }
         if let Some(writer) = writer {
-            new_layers.push(
-                writer
-                    .finish(prev_key.unwrap().next(), self, ctx)
-                    .await
-                    .map_err(CompactionError::Other)?,
-            );
+            let (desc, path) = writer
+                .finish(prev_key.unwrap().next(), ctx)
+                .await
+                .map_err(CompactionError::Other)?;
+            let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
+                .map_err(CompactionError::Other)?;
+            new_layers.push(new_delta);
         }
 
         // Sync layers
@@ -1966,13 +1969,16 @@ impl Timeline {
             for (key, lsn, val) in deltas {
                 delta_layer_writer.put_value(key, lsn, val, ctx).await?;
             }
+
             stats.produce_delta_layer(delta_layer_writer.size());
             if dry_run {
                 return Ok(None);
             }
-            let delta_layer = delta_layer_writer
-                .finish(delta_key.key_range.end, tline, ctx)
+
+            let (desc, path) = delta_layer_writer
+                .finish(delta_key.key_range.end, ctx)
                 .await?;
+            let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
             Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
         }
 
@@ -2413,9 +2419,9 @@ impl CompactionJobExecutor for TimelineAdaptor {
             ))
         });
 
-        let new_delta_layer = writer
-            .finish(prev.unwrap().0.next(), &self.timeline, ctx)
-            .await?;
+        let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
+        let new_delta_layer =
+            Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
 
         self.new_deltas.push(new_delta_layer);
         Ok(())
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index ee5f8cd52a..645b5ad2bf 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -488,10 +488,12 @@ async fn copy_lsn_prefix(
         // reuse the key instead of adding more holes between layers by using the real
         // highest key in the layer.
         let reused_highest_key = layer.layer_desc().key_range.end;
-        let copied = writer
-            .finish(reused_highest_key, target_timeline, ctx)
+        let (desc, path) = writer
+            .finish(reused_highest_key, ctx)
             .await
             .map_err(CopyDeltaPrefix)?;
+        let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
+            .map_err(CopyDeltaPrefix)?;
 
         tracing::debug!(%layer, %copied, "new layer produced");
 

From 22790fc9075b7108d6d26b84b3ba1e1f8a9d94e7 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Tue, 6 Aug 2024 13:55:42 -0400
Subject: [PATCH 21/48] scrubber: clean up `scan_metadata` before prod (#8565)

Part of #8128.

## Problem
Currently, scrubber `scan_metadata` command will return with an error
code if the metadata on remote storage is corrupted with fatal errors.
To safely deploy this command in a cronjob, we want to differentiate
between failures while running scrubber command and the erroneous
metadata. At the same time, we also want our regression tests to catch
corrupted metadata using the scrubber command.

## Summary of changes

- Return with error code only when the scrubber command fails
- Uses explicit checks on errors and warnings to determine metadata
health in regression tests.

**Resolve conflict with `tenant-snapshot` command (after shard split):**
[`test_scrubber_tenant_snapshot`](https://github.com/neondatabase/neon/blob/yuchen/scrubber-scan-cleanup-before-prod/test_runner/regress/test_storage_scrubber.py#L23)
failed before applying 422a8443ddb7f1c7a26907a96c4aed0c5d554e67
- When taking a snapshot, the old `index_part.json` in the unsharded
tenant directory is not kept.
- The current `list_timeline_blobs` implementation consider no
`index_part.json` as a parse error.
- During the scan, we are only analyzing shards with highest shard
count, so we will not get a parse error. but we do need to add the
layers to tenant object listing, otherwise we will get index is
referencing a layer that is not in remote storage error.
- **Action:** Add s3_layers from `list_timeline_blobs` regardless of
parsing error

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 storage_scrubber/src/checks.rs                | 14 ++++++++----
 storage_scrubber/src/main.rs                  | 10 ++++-----
 .../src/pageserver_physical_gc.rs             | 14 ++++++++----
 .../src/scan_pageserver_metadata.rs           | 22 +++++++++++++------
 storage_scrubber/src/tenant_snapshot.rs       |  2 +-
 test_runner/fixtures/neon_fixtures.py         | 14 +++++++++---
 test_runner/regress/test_compatibility.py     |  5 ++---
 .../regress/test_pageserver_generations.py    |  5 ++---
 .../regress/test_pageserver_secondary.py      |  3 ++-
 test_runner/regress/test_sharding.py          |  3 ++-
 test_runner/regress/test_storage_scrubber.py  | 11 +++++-----
 test_runner/regress/test_tenant_delete.py     |  8 +++----
 12 files changed, 70 insertions(+), 41 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 14788515dd..35ec69fd50 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -172,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                     }
                 }
                 BlobDataParseResult::Relic => {}
-                BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
-                    parse_errors
+                BlobDataParseResult::Incorrect {
+                    errors,
+                    s3_layers: _,
+                } => result.errors.extend(
+                    errors
                         .into_iter()
                         .map(|error| format!("parse error: {error}")),
                 ),
@@ -300,7 +303,10 @@ pub(crate) enum BlobDataParseResult {
     },
     /// The remains of a deleted Timeline (i.e. an initdb archive only)
     Relic,
-    Incorrect(Vec<String>),
+    Incorrect {
+        errors: Vec<String>,
+        s3_layers: HashSet<(LayerName, Generation)>,
+    },
 }
 
 pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
@@ -443,7 +449,7 @@ pub(crate) async fn list_timeline_blobs(
     }
 
     Ok(S3TimelineBlobData {
-        blob_data: BlobDataParseResult::Incorrect(errors),
+        blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
         unused_index_keys: index_part_keys,
         unknown_keys,
     })
diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index a111c31844..cbc836755a 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -208,21 +208,21 @@ async fn main() -> anyhow::Result<()> {
                         }
 
                         if summary.is_fatal() {
-                            Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                            tracing::error!("Fatal scrub errors detected");
                         } else if summary.is_empty() {
                             // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
                             // scrubber they were likely expecting to scan something, and if we see no timelines
                             // at all then it's likely due to some configuration issues like a bad prefix
-                            Err(anyhow::anyhow!(
+                            tracing::error!(
                                 "No timelines found in bucket {} prefix {}",
                                 bucket_config.bucket,
                                 bucket_config
                                     .prefix_in_bucket
                                     .unwrap_or("<none>".to_string())
-                            ))
-                        } else {
-                            Ok(())
+                            );
                         }
+
+                        Ok(())
                     }
                 }
             }
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 69896caa82..ff230feae3 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -389,10 +389,13 @@ async fn gc_ancestor(
                 // Post-deletion tenant location: don't try and GC it.
                 continue;
             }
-            BlobDataParseResult::Incorrect(reasons) => {
+            BlobDataParseResult::Incorrect {
+                errors,
+                s3_layers: _, // TODO(yuchen): could still check references to these s3 layers?
+            } => {
                 // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
                 tracing::warn!(
-                    "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
+                    "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}"
                 );
                 continue;
             }
@@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc(
                 // Post-deletion tenant location: don't try and GC it.
                 return Ok(summary);
             }
-            BlobDataParseResult::Incorrect(reasons) => {
+            BlobDataParseResult::Incorrect {
+                errors,
+                s3_layers: _,
+            } => {
                 // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
-                tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
+                tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
                 return Ok(summary);
             }
         };
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index dc410bde41..b9630056e1 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -290,13 +290,21 @@ pub async fn scan_metadata(
             }
         }
 
-        if let BlobDataParseResult::Parsed {
-            index_part: _index_part,
-            index_part_generation: _index_part_generation,
-            s3_layers,
-        } = &data.blob_data
-        {
-            tenant_objects.push(ttid, s3_layers.clone());
+        match &data.blob_data {
+            BlobDataParseResult::Parsed {
+                index_part: _index_part,
+                index_part_generation: _index_part_generation,
+                s3_layers,
+            } => {
+                tenant_objects.push(ttid, s3_layers.clone());
+            }
+            BlobDataParseResult::Relic => (),
+            BlobDataParseResult::Incorrect {
+                errors: _,
+                s3_layers,
+            } => {
+                tenant_objects.push(ttid, s3_layers.clone());
+            }
         }
         tenant_timeline_results.push((ttid, data));
     }
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
index 5a75f8d40e..1866e6ec80 100644
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -269,7 +269,7 @@ impl SnapshotDownloader {
                         .context("Downloading timeline")?;
                     }
                     BlobDataParseResult::Relic => {}
-                    BlobDataParseResult::Incorrect(_) => {
+                    BlobDataParseResult::Incorrect { .. } => {
                         tracing::error!("Bad metadata in timeline {ttid}");
                     }
                 };
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 7289472de2..c6f4404784 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -978,7 +978,10 @@ class NeonEnvBuilder:
                 and self.enable_scrub_on_exit
             ):
                 try:
-                    self.env.storage_scrubber.scan_metadata()
+                    healthy, _ = self.env.storage_scrubber.scan_metadata()
+                    if not healthy:
+                        e = Exception("Remote storage metadata corrupted")
+                        cleanup_error = e
                 except Exception as e:
                     log.error(f"Error during remote storage scrub: {e}")
                     cleanup_error = e
@@ -4411,14 +4414,19 @@ class StorageScrubber:
         assert stdout is not None
         return stdout
 
-    def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
+    def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]:
+        """
+        Returns the health status and the metadata summary.
+        """
         args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
         if post_to_storage_controller:
             args.append("--post")
         stdout = self.scrubber_cli(args, timeout=30)
 
         try:
-            return json.loads(stdout)
+            summary = json.loads(stdout)
+            healthy = not summary["with_errors"] and not summary["with_warnings"]
+            return healthy, summary
         except:
             log.error("Failed to decode JSON output from `scan-metadata`.  Dumping stdout:")
             log.error(stdout)
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 137b0e931d..afa5f6873c 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -496,11 +496,10 @@ def test_historic_storage_formats(
     # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt)
     #
     # Do this _before_ importing to the pageserver, as that import may start writing immediately
-    metadata_summary = env.storage_scrubber.scan_metadata()
+    healthy, metadata_summary = env.storage_scrubber.scan_metadata()
+    assert healthy
     assert metadata_summary["tenant_count"] >= 1
     assert metadata_summary["timeline_count"] >= 1
-    assert not metadata_summary["with_errors"]
-    assert not metadata_summary["with_warnings"]
 
     env.neon_cli.import_tenant(dataset.tenant_id)
 
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 8941ddd281..73af7950f1 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
     # Having written a mixture of generation-aware and legacy index_part.json,
     # ensure the scrubber handles the situation as expected.
-    metadata_summary = env.storage_scrubber.scan_metadata()
+    healthy, metadata_summary = env.storage_scrubber.scan_metadata()
     assert metadata_summary["tenant_count"] == 1  # Scrubber should have seen our timeline
     assert metadata_summary["timeline_count"] == 1
     assert metadata_summary["timeline_shard_count"] == 1
-    assert not metadata_summary["with_errors"]
-    assert not metadata_summary["with_warnings"]
+    assert healthy
 
 
 def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 4b0af24480..8746b88a75 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -593,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     # Scrub the remote storage
     # ========================
     # This confirms that the scrubber isn't upset by the presence of the heatmap
-    env.storage_scrubber.scan_metadata()
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
 
     # Detach secondary and delete tenant
     # ===================================
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 7f30b2d7a7..1011a6fd22 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -124,7 +124,8 @@ def test_sharding_smoke(
 
     # Check the scrubber isn't confused by sharded content, then disable
     # it during teardown because we'll have deleted by then
-    env.storage_scrubber.scan_metadata()
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
 
     env.storage_controller.pageserver_api().tenant_delete(tenant_id)
     assert_prefix_empty(
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index e3f627b6a6..388f6a9e92 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -516,9 +516,8 @@ def test_scrubber_scan_pageserver_metadata(
     assert len(index.layer_metadata) > 0
     it = iter(index.layer_metadata.items())
 
-    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
-    assert not scan_summary["with_warnings"]
-    assert not scan_summary["with_errors"]
+    healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+    assert healthy
 
     assert env.storage_controller.metadata_health_is_healthy()
 
@@ -532,16 +531,18 @@ def test_scrubber_scan_pageserver_metadata(
     log.info(f"delete response: {delete_response}")
 
     # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
-    scan_summary = env.storage_scrubber.scan_metadata()
+    _, scan_summary = env.storage_scrubber.scan_metadata()
     log.info(f"{pprint.pformat(scan_summary)}")
     assert len(scan_summary["with_warnings"]) > 0
 
     assert env.storage_controller.metadata_health_is_healthy()
 
     # Now post to storage controller, expect seeing one unhealthy health record
-    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+    _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
     log.info(f"{pprint.pformat(scan_summary)}")
     assert len(scan_summary["with_warnings"]) > 0
 
     unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
     assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
+
+    neon_env_builder.disable_scrub_on_exit()
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index c343b349cf..c01b3a2e89 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -341,13 +341,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
     wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
     env.stop()
 
-    result = env.storage_scrubber.scan_metadata()
-    assert result["with_warnings"] == []
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
 
     env.start()
     ps_http = env.pageserver.http_client()
     ps_http.tenant_delete(tenant_id)
     env.stop()
 
-    env.storage_scrubber.scan_metadata()
-    assert result["with_warnings"] == []
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy

From 9a3bc5556a713dc1560268588ca7873f375a0489 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 7 Aug 2024 09:14:26 +0200
Subject: [PATCH 22/48] storage broker: only print one line for version and
 build tag in init (#8624)

This makes it more consistent with pageserver and safekeeper. Also, it
is easier to collect the two values into one data point.
---
 storage_broker/src/bin/storage_broker.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index 0a4af543ab..15acd0e49c 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -642,8 +642,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     logging::replace_panic_hook_with_tracing_panic_hook().forget();
     // initialize sentry if SENTRY_DSN is provided
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
-    info!("version: {GIT_VERSION}");
-    info!("build_tag: {BUILD_TAG}");
+    info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}");
     metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
 
     // On any shutdown signal, log receival and exit.

From c75e6fbc46754479b5e6cfe3e52744f1c7414ed4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 7 Aug 2024 09:29:52 +0200
Subject: [PATCH 23/48] Lower level for timeline cancellations during gc
 (#8626)

Timeline cancellation running in parallel with gc yields error log lines
like:

```
Gc failed 1 times, retrying in 2s: TimelineCancelled
```

They are completely harmless though and normal to occur. Therefore, only
print those messages at an info level. Still print them at all so that
we know what is going on if we focus on a single timeline.
---
 pageserver/src/tenant/tasks.rs | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 230362d81a..b4706ea59d 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -407,9 +407,16 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                         error_run_count += 1;
                         let wait_duration = Duration::from_secs_f64(wait_duration);
 
-                        error!(
-                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
-                    );
+                        if matches!(e, crate::tenant::GcError::TimelineCancelled) {
+                            // Timeline was cancelled during gc. We might either be in an event
+                            // that affects the entire tenant (tenant deletion, pageserver shutdown),
+                            // or in one that affects the timeline only (timeline deletion).
+                            // Therefore, don't exit the loop.
+                            info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
+                        } else {
+                            error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
+                        }
+
                         wait_duration
                     }
                 }

From 38f184bc9173ccb5ee664cde84be31bb19cf414e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 7 Aug 2024 14:53:52 +0200
Subject: [PATCH 24/48] Add missing colon to ArchivalConfigRequest
 specification (#8627)

Add a missing colon to the API specification of `ArchivalConfigRequest`.
The `state` field is required. Pointed out by Gleb.
---
 pageserver/src/http/openapi_spec.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 4656f2c93a..42086dc2e6 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -932,7 +932,7 @@ components:
           description: Whether to poll remote storage for layers to download.  If false, secondary locations don't download anything.
     ArchivalConfigRequest:
       type: object
-      required
+      required:
         - state
       properties:
         state:

From 1f73dfb8420ed08825dc28c6d3f443313bfceac6 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 7 Aug 2024 14:37:03 +0100
Subject: [PATCH 25/48] proxy: random changes (#8602)

## Problem

1. Hard to correlate startup parameters with the endpoint that provided
them.
2. Some configurations are not needed in the `ProxyConfig` struct.

## Summary of changes

Because of some borrow checker fun, I needed to switch to an
interior-mutability implementation of our `RequestMonitoring` context
system. Using https://docs.rs/try-lock/latest/try_lock/ as a cheap lock
for such a use-case (needed to be thread safe).

Removed the lock of each startup message, instead just logging only the
startup params in a successful handshake.

Also removed from values from `ProxyConfig` and kept as arguments.
(needed for local-proxy config)
---
 Cargo.lock                            |   5 +-
 Cargo.toml                            |   1 +
 proxy/Cargo.toml                      |   1 +
 proxy/src/auth/backend.rs             |  40 ++---
 proxy/src/auth/backend/classic.rs     |   4 +-
 proxy/src/auth/backend/hacks.rs       |   8 +-
 proxy/src/auth/backend/link.rs        |   2 +-
 proxy/src/auth/credentials.rs         |  60 ++++---
 proxy/src/auth/flow.rs                |  10 +-
 proxy/src/bin/pg_sni_router.rs        |   6 +-
 proxy/src/bin/proxy.rs                |  20 +--
 proxy/src/cache/endpoints.rs          |   2 +-
 proxy/src/compute.rs                  |  10 +-
 proxy/src/config.rs                   |   4 -
 proxy/src/console/provider.rs         |  14 +-
 proxy/src/console/provider/mock.rs    |   6 +-
 proxy/src/console/provider/neon.rs    |  22 +--
 proxy/src/context.rs                  | 241 +++++++++++++++++++-------
 proxy/src/context/parquet.rs          |   6 +-
 proxy/src/metrics.rs                  |  31 +---
 proxy/src/proxy.rs                    |  18 +-
 proxy/src/proxy/connect_compute.rs    |  16 +-
 proxy/src/proxy/handshake.rs          |  25 ++-
 proxy/src/proxy/tests.rs              |  41 ++---
 proxy/src/proxy/tests/mitm.rs         |  11 +-
 proxy/src/proxy/wake_compute.rs       |   6 +-
 proxy/src/serverless.rs               |   4 +-
 proxy/src/serverless/backend.rs       |  12 +-
 proxy/src/serverless/conn_pool.rs     |  15 +-
 proxy/src/serverless/sql_over_http.rs |  17 +-
 proxy/src/serverless/websocket.rs     |   4 +-
 31 files changed, 386 insertions(+), 276 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 764c0fbd30..f565119dbd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4324,6 +4324,7 @@ dependencies = [
  "tracing-opentelemetry",
  "tracing-subscriber",
  "tracing-utils",
+ "try-lock",
  "typed-json",
  "url",
  "urlencoding",
@@ -6563,9 +6564,9 @@ dependencies = [
 
 [[package]]
 name = "try-lock"
-version = "0.2.4"
+version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 
 [[package]]
 name = "tungstenite"
diff --git a/Cargo.toml b/Cargo.toml
index af1c1dfc82..963841e340 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -184,6 +184,7 @@ tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
 typed-json = "0.1"
 url = "2.2"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 2f18b5fbc6..b316c53034 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 tracing.workspace = true
+try-lock.workspace = true
 typed-json.workspace = true
 url.workspace = true
 urlencoding.workspace = true
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 67c4dd019e..90dea01bf3 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -218,7 +218,7 @@ impl RateBucketInfo {
 impl AuthenticationConfig {
     pub fn check_rate_limit(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         config: &AuthenticationConfig,
         secret: AuthSecret,
         endpoint: &EndpointId,
@@ -243,7 +243,7 @@ impl AuthenticationConfig {
         let limit_not_exceeded = self.rate_limiter.check(
             (
                 endpoint_int,
-                MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
+                MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
             ),
             password_weight,
         );
@@ -274,7 +274,7 @@ impl AuthenticationConfig {
 ///
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     api: &impl console::Api,
     user_info: ComputeUserInfoMaybeEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -303,8 +303,8 @@ async fn auth_quirks(
     let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
 
     // check allowed list
-    if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
+    if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
     }
 
     if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
@@ -356,7 +356,7 @@ async fn auth_quirks(
 }
 
 async fn authenticate_with_secret(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     secret: AuthSecret,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub async fn authenticate(
         self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
@@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
 impl BackendType<'_, ComputeUserInfo, &()> {
     pub async fn get_role_secret(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         use BackendType::*;
         match self {
@@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
 
     pub async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         use BackendType::*;
         match self {
@@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
 impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
     async fn wake_compute(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
         use BackendType::*;
 
@@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
 impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
     async fn wake_compute(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
         use BackendType::*;
 
@@ -571,7 +571,7 @@ mod tests {
     impl console::Api for Auth {
         async fn get_role_secret(
             &self,
-            _ctx: &mut RequestMonitoring,
+            _ctx: &RequestMonitoring,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
             Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
@@ -579,7 +579,7 @@ mod tests {
 
         async fn get_allowed_ips_and_secret(
             &self,
-            _ctx: &mut RequestMonitoring,
+            _ctx: &RequestMonitoring,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
         {
@@ -591,7 +591,7 @@ mod tests {
 
         async fn wake_compute(
             &self,
-            _ctx: &mut RequestMonitoring,
+            _ctx: &RequestMonitoring,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
             unimplemented!()
@@ -665,7 +665,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let mut ctx = RequestMonitoring::test();
+        let ctx = RequestMonitoring::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -723,7 +723,7 @@ mod tests {
         ));
 
         let _creds = auth_quirks(
-            &mut ctx,
+            &ctx,
             &api,
             user_info,
             &mut stream,
@@ -742,7 +742,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let mut ctx = RequestMonitoring::test();
+        let ctx = RequestMonitoring::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -775,7 +775,7 @@ mod tests {
         ));
 
         let _creds = auth_quirks(
-            &mut ctx,
+            &ctx,
             &api,
             user_info,
             &mut stream,
@@ -794,7 +794,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let mut ctx = RequestMonitoring::test();
+        let ctx = RequestMonitoring::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -828,7 +828,7 @@ mod tests {
         ));
 
         let creds = auth_quirks(
-            &mut ctx,
+            &ctx,
             &api,
             user_info,
             &mut stream,
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index b98fa63120..285fa29428 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};
 
 pub(super) async fn authenticate(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     creds: ComputeUserInfo,
     client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     config: &'static AuthenticationConfig,
@@ -27,7 +27,7 @@ pub(super) async fn authenticate(
         }
         AuthSecret::Scram(secret) => {
             info!("auth endpoint chooses SCRAM");
-            let scram = auth::Scram(&secret, &mut *ctx);
+            let scram = auth::Scram(&secret, ctx);
 
             let auth_outcome = tokio::time::timeout(
                 config.scram_protocol_timeout,
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index 6b0f5e1726..56921dd949 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -18,7 +18,7 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn authenticate_cleartext(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     secret: AuthSecret,
@@ -28,7 +28,7 @@ pub async fn authenticate_cleartext(
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     // pause the timer while we communicate with the client
-    let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
+    let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
 
     let ep = EndpointIdInt::from(&info.endpoint);
 
@@ -60,7 +60,7 @@ pub async fn authenticate_cleartext(
 /// Similar to [`authenticate_cleartext`], but there's a specific password format,
 /// and passwords are not yet validated (we don't know how to validate them!)
 pub async fn password_hack_no_authentication(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
 ) -> auth::Result<ComputeCredentials> {
@@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication(
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     // pause the timer while we communicate with the client
-    let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
+    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
 
     let payload = AuthFlow::new(client)
         .begin(auth::PasswordHack)
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index 5932e1337c..95f4614736 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String {
 }
 
 pub(super) async fn authenticate(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index d06f5614f1..8f4a392131 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -84,7 +84,7 @@ pub fn endpoint_sni(
 
 impl ComputeUserInfoMaybeEndpoint {
     pub fn parse(
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         params: &StartupMessageParams,
         sni: Option<&str>,
         common_names: Option<&HashSet<String>>,
@@ -249,8 +249,8 @@ mod tests {
     fn parse_bare_minimum() -> anyhow::Result<()> {
         // According to postgresql, only `user` should be required.
         let options = StartupMessageParams::new([("user", "john_doe")]);
-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
+        let ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id, None);
 
@@ -264,8 +264,8 @@ mod tests {
             ("database", "world"), // should be ignored
             ("foo", "bar"),        // should be ignored
         ]);
-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
+        let ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id, None);
 
@@ -279,9 +279,9 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let mut ctx = RequestMonitoring::test();
+        let ctx = RequestMonitoring::test();
         let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
         assert_eq!(user_info.options.get_cache_key("foo"), "foo");
@@ -296,8 +296,8 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
+        let ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
 
@@ -311,8 +311,8 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar -c geqo=off"),
         ]);
 
-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
+        let ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
 
@@ -329,8 +329,8 @@ mod tests {
             ),
         ]);
 
-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
+        let ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert!(user_info.endpoint_id.is_none());
 
@@ -344,8 +344,8 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
         ]);
 
-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
+        let ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert!(user_info.endpoint_id.is_none());
 
@@ -359,9 +359,9 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let mut ctx = RequestMonitoring::test();
+        let ctx = RequestMonitoring::test();
         let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
 
@@ -374,16 +374,16 @@ mod tests {
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.a.com");
-        let mut ctx = RequestMonitoring::test();
+        let ctx = RequestMonitoring::test();
         let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.b.com");
-        let mut ctx = RequestMonitoring::test();
+        let ctx = RequestMonitoring::test();
         let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
 
         Ok(())
@@ -397,10 +397,9 @@ mod tests {
         let sni = Some("second.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let mut ctx = RequestMonitoring::test();
-        let err =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
-                .expect_err("should fail");
+        let ctx = RequestMonitoring::test();
+        let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
+            .expect_err("should fail");
         match err {
             InconsistentProjectNames { domain, option } => {
                 assert_eq!(option, "first");
@@ -417,10 +416,9 @@ mod tests {
         let sni = Some("project.localhost");
         let common_names = Some(["example.com".into()].into());
 
-        let mut ctx = RequestMonitoring::test();
-        let err =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
-                .expect_err("should fail");
+        let ctx = RequestMonitoring::test();
+        let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
+            .expect_err("should fail");
         match err {
             UnknownCommonName { cn } => {
                 assert_eq!(cn, "localhost");
@@ -438,9 +436,9 @@ mod tests {
 
         let sni = Some("project.localhost");
         let common_names = Some(["localhost".into()].into());
-        let mut ctx = RequestMonitoring::test();
+        let ctx = RequestMonitoring::test();
         let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
         assert_eq!(
             user_info.options.get_cache_key("project"),
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 59d1ac17f4..acf7b4f6b6 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -27,7 +27,7 @@ pub trait AuthMethod {
 pub struct Begin;
 
 /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
-pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);
+pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
 
 impl AuthMethod for Scram<'_> {
     #[inline(always)]
@@ -155,7 +155,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
         let Scram(secret, ctx) = self.state;
 
         // pause the timer while we communicate with the client
-        let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
+        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
 
         // Initial client message contains the chosen auth method's name.
         let msg = self.stream.read_password_message().await?;
@@ -168,10 +168,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
         }
 
         match sasl.method {
-            SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
-            SCRAM_SHA_256_PLUS => {
-                ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
-            }
+            SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
+            SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
             _ => {}
         }
         info!("client chooses {}", sasl.method);
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index d7a3eb9a4d..1038fa5116 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -205,7 +205,7 @@ async fn task_main(
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 
 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     raw_stream: S,
     tls_config: Arc<rustls::ServerConfig>,
     tls_server_end_point: TlsServerEndPoint,
@@ -256,13 +256,13 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }
 
 async fn handle_client(
-    mut ctx: RequestMonitoring,
+    ctx: RequestMonitoring,
     dest_suffix: Arc<String>,
     tls_config: Arc<rustls::ServerConfig>,
     tls_server_end_point: TlsServerEndPoint,
     stream: impl AsyncRead + AsyncWrite + Unpin,
 ) -> anyhow::Result<()> {
-    let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;
+    let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?;
 
     // Cut off first part of the SNI domain
     // We receive required destination details in the format of
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index c1fd6dfd80..b44e0ddd2f 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -5,6 +5,7 @@ use aws_config::meta::region::RegionProviderChain;
 use aws_config::profile::ProfileFileCredentialsProvider;
 use aws_config::provider_config::ProviderConfig;
 use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
+use aws_config::Region;
 use futures::future::Either;
 use proxy::auth;
 use proxy::auth::backend::AuthRateLimiter;
@@ -290,9 +291,10 @@ async fn main() -> anyhow::Result<()> {
     let config = build_config(&args)?;
 
     info!("Authentication backend: {}", config.auth_backend);
-    info!("Using region: {}", config.aws_region);
+    info!("Using region: {}", args.aws_region);
 
-    let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
+    let region_provider =
+        RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
     let provider_conf =
         ProviderConfig::without_region().with_region(region_provider.region().await);
     let aws_credentials_provider = {
@@ -318,7 +320,7 @@ async fn main() -> anyhow::Result<()> {
     };
     let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
         elasticache::AWSIRSAConfig::new(
-            config.aws_region.clone(),
+            args.aws_region.clone(),
             args.redis_cluster_name,
             args.redis_user_id,
         ),
@@ -376,11 +378,14 @@ async fn main() -> anyhow::Result<()> {
 
     let cancel_map = CancelMap::default();
 
+    let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
+    RateBucketInfo::validate(redis_rps_limit)?;
+
     let redis_publisher = match &regional_redis_client {
         Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
             redis_publisher.clone(),
             args.region.clone(),
-            &config.redis_rps_limit,
+            redis_rps_limit,
         )?))),
         None => None,
     };
@@ -656,7 +661,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     )?;
 
     let http_config = HttpConfig {
-        request_timeout: args.sql_over_http.sql_over_http_timeout,
         pool_options: GlobalConnPoolOptions {
             max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
             gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
@@ -676,9 +680,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
     };
 
-    let mut redis_rps_limit = args.redis_rps_limit.clone();
-    RateBucketInfo::validate(&mut redis_rps_limit)?;
-
     let config = Box::leak(Box::new(ProxyConfig {
         tls_config,
         auth_backend,
@@ -687,11 +688,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         http_config,
         authentication_config,
         require_client_ip: args.require_client_ip,
-        disable_ip_check_for_http: args.disable_ip_check_for_http,
-        redis_rps_limit,
         handshake_timeout: args.handshake_timeout,
         region: args.region.clone(),
-        aws_region: args.aws_region.clone(),
         wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
         connect_compute_locks,
         connect_to_compute_retry_config: config::RetryConfig::parse(
diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs
index 4bc10a6020..8c851790c2 100644
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -68,7 +68,7 @@ impl EndpointsCache {
             ready: AtomicBool::new(false),
         }
     }
-    pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
+    pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
         if !self.ready.load(Ordering::Acquire) {
             return true;
         }
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index f91693c704..21687160ea 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -288,12 +288,12 @@ impl ConnCfg {
     /// Connect to a corresponding compute node.
     pub async fn connect(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         allow_self_signed_compute: bool,
         aux: MetricsAuxInfo,
         timeout: Duration,
     ) -> Result<PostgresConnection, ConnectionError> {
-        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
         drop(pause);
 
@@ -316,14 +316,14 @@ impl ConnCfg {
         )?;
 
         // connect_raw() will not use TLS if sslmode is "disable"
-        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let (client, connection) = self.0.connect_raw(stream, tls).await?;
         drop(pause);
         tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
         let stream = connection.stream.into_inner();
 
         info!(
-            cold_start_info = ctx.cold_start_info.as_str(),
+            cold_start_info = ctx.cold_start_info().as_str(),
             "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
             self.0.get_ssl_mode()
         );
@@ -342,7 +342,7 @@ impl ConnCfg {
             params,
             cancel_closure,
             aux,
-            _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
+            _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
         };
 
         Ok(connection)
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 6504919760..1412095505 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -31,11 +31,8 @@ pub struct ProxyConfig {
     pub http_config: HttpConfig,
     pub authentication_config: AuthenticationConfig,
     pub require_client_ip: bool,
-    pub disable_ip_check_for_http: bool,
-    pub redis_rps_limit: Vec<RateBucketInfo>,
     pub region: String,
     pub handshake_timeout: Duration,
-    pub aws_region: String,
     pub wake_compute_retry_config: RetryConfig,
     pub connect_compute_locks: ApiLocks<Host>,
     pub connect_to_compute_retry_config: RetryConfig,
@@ -55,7 +52,6 @@ pub struct TlsConfig {
 }
 
 pub struct HttpConfig {
-    pub request_timeout: tokio::time::Duration,
     pub pool_options: GlobalConnPoolOptions,
     pub cancel_set: CancelSet,
     pub client_conn_threshold: u64,
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 7a9637066f..15fc0134b3 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -292,7 +292,7 @@ pub struct NodeInfo {
 impl NodeInfo {
     pub async fn connect(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         timeout: Duration,
     ) -> Result<compute::PostgresConnection, compute::ConnectionError> {
         self.config
@@ -330,20 +330,20 @@ pub(crate) trait Api {
     /// We still have to mock the scram to avoid leaking information that user doesn't exist.
     async fn get_role_secret(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
 
     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }
@@ -363,7 +363,7 @@ pub enum ConsoleBackend {
 impl Api for ConsoleBackend {
     async fn get_role_secret(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
         use ConsoleBackend::*;
@@ -378,7 +378,7 @@ impl Api for ConsoleBackend {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
         use ConsoleBackend::*;
@@ -393,7 +393,7 @@ impl Api for ConsoleBackend {
 
     async fn wake_compute(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
         use ConsoleBackend::*;
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index cfe491f2aa..2093da7562 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -158,7 +158,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn get_role_secret(
         &self,
-        _ctx: &mut RequestMonitoring,
+        _ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         Ok(CachedRoleSecret::new_uncached(
@@ -168,7 +168,7 @@ impl super::Api for Api {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        _ctx: &mut RequestMonitoring,
+        _ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         Ok((
@@ -182,7 +182,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
-        _ctx: &mut RequestMonitoring,
+        _ctx: &RequestMonitoring,
         _user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         self.do_wake_compute().map_ok(Cached::new_uncached).await
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 768cd2fdfa..7eda238b66 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -57,7 +57,7 @@ impl Api {
 
     async fn do_get_auth_info(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         if !self
@@ -69,7 +69,7 @@ impl Api {
             info!("endpoint is not valid, skipping the request");
             return Ok(AuthInfo::default());
         }
-        let request_id = ctx.session_id.to_string();
+        let request_id = ctx.session_id().to_string();
         let application_name = ctx.console_application_name();
         async {
             let request = self
@@ -77,7 +77,7 @@ impl Api {
                 .get("proxy_get_role_secret")
                 .header("X-Request-ID", &request_id)
                 .header("Authorization", format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", ctx.session_id)])
+                .query(&[("session_id", ctx.session_id())])
                 .query(&[
                     ("application_name", application_name.as_str()),
                     ("project", user_info.endpoint.as_str()),
@@ -87,7 +87,7 @@ impl Api {
 
             info!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
-            let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
+            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
             let response = self.endpoint.execute(request).await?;
             drop(pause);
             info!(duration = ?start.elapsed(), "received http response");
@@ -130,10 +130,10 @@ impl Api {
 
     async fn do_wake_compute(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<NodeInfo, WakeComputeError> {
-        let request_id = ctx.session_id.to_string();
+        let request_id = ctx.session_id().to_string();
         let application_name = ctx.console_application_name();
         async {
             let mut request_builder = self
@@ -141,7 +141,7 @@ impl Api {
                 .get("proxy_wake_compute")
                 .header("X-Request-ID", &request_id)
                 .header("Authorization", format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", ctx.session_id)])
+                .query(&[("session_id", ctx.session_id())])
                 .query(&[
                     ("application_name", application_name.as_str()),
                     ("project", user_info.endpoint.as_str()),
@@ -156,7 +156,7 @@ impl Api {
 
             info!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
-            let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
+            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
             let response = self.endpoint.execute(request).await?;
             drop(pause);
             info!(duration = ?start.elapsed(), "received http response");
@@ -192,7 +192,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn get_role_secret(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         let normalized_ep = &user_info.endpoint.normalize();
@@ -226,7 +226,7 @@ impl super::Api for Api {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         let normalized_ep = &user_info.endpoint.normalize();
@@ -268,7 +268,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         let key = user_info.endpoint_cache_key();
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index ff79ba8275..e925f67233 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -7,13 +7,14 @@ use smol_str::SmolStr;
 use std::net::IpAddr;
 use tokio::sync::mpsc;
 use tracing::{field::display, info, info_span, Span};
+use try_lock::TryLock;
 use uuid::Uuid;
 
 use crate::{
     console::messages::{ColdStartInfo, MetricsAuxInfo},
     error::ErrorKind,
     intern::{BranchIdInt, ProjectIdInt},
-    metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol},
+    metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting},
     DbName, EndpointId, RoleName,
 };
 
@@ -28,7 +29,15 @@ pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>>
 ///
 /// This data should **not** be used for connection logic, only for observability and limiting purposes.
 /// All connection logic should instead use strongly typed state machines, not a bunch of Options.
-pub struct RequestMonitoring {
+pub struct RequestMonitoring(
+    /// To allow easier use of the ctx object, we have interior mutability.
+    /// I would typically use a RefCell but that would break the `Send` requirements
+    /// so we need something with thread-safety. `TryLock` is a cheap alternative
+    /// that offers similar semantics to a `RefCell` but with synchronisation.
+    TryLock<RequestMonitoringInner>,
+);
+
+struct RequestMonitoringInner {
     pub peer_addr: IpAddr,
     pub session_id: Uuid,
     pub protocol: Protocol,
@@ -85,7 +94,7 @@ impl RequestMonitoring {
             role = tracing::field::Empty,
         );
 
-        Self {
+        let inner = RequestMonitoringInner {
             peer_addr,
             session_id,
             protocol,
@@ -110,7 +119,9 @@ impl RequestMonitoring {
             disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
             latency_timer: LatencyTimer::new(protocol),
             disconnect_timestamp: None,
-        }
+        };
+
+        Self(TryLock::new(inner))
     }
 
     #[cfg(test)]
@@ -119,48 +130,177 @@ impl RequestMonitoring {
     }
 
     pub fn console_application_name(&self) -> String {
+        let this = self.0.try_lock().expect("should not deadlock");
         format!(
             "{}/{}",
-            self.application.as_deref().unwrap_or_default(),
-            self.protocol
+            this.application.as_deref().unwrap_or_default(),
+            this.protocol
         )
     }
 
-    pub fn set_rejected(&mut self, rejected: bool) {
-        self.rejected = Some(rejected);
+    pub fn set_rejected(&self, rejected: bool) {
+        let mut this = self.0.try_lock().expect("should not deadlock");
+        this.rejected = Some(rejected);
     }
 
-    pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
+    pub fn set_cold_start_info(&self, info: ColdStartInfo) {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .set_cold_start_info(info);
+    }
+
+    pub fn set_db_options(&self, options: StartupMessageParams) {
+        let mut this = self.0.try_lock().expect("should not deadlock");
+        this.set_application(options.get("application_name").map(SmolStr::from));
+        if let Some(user) = options.get("user") {
+            this.set_user(user.into());
+        }
+        if let Some(dbname) = options.get("database") {
+            this.set_dbname(dbname.into());
+        }
+
+        this.pg_options = Some(options);
+    }
+
+    pub fn set_project(&self, x: MetricsAuxInfo) {
+        let mut this = self.0.try_lock().expect("should not deadlock");
+        if this.endpoint_id.is_none() {
+            this.set_endpoint_id(x.endpoint_id.as_str().into())
+        }
+        this.branch = Some(x.branch_id);
+        this.project = Some(x.project_id);
+        this.set_cold_start_info(x.cold_start_info);
+    }
+
+    pub fn set_project_id(&self, project_id: ProjectIdInt) {
+        let mut this = self.0.try_lock().expect("should not deadlock");
+        this.project = Some(project_id);
+    }
+
+    pub fn set_endpoint_id(&self, endpoint_id: EndpointId) {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .set_endpoint_id(endpoint_id);
+    }
+
+    pub fn set_dbname(&self, dbname: DbName) {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .set_dbname(dbname);
+    }
+
+    pub fn set_user(&self, user: RoleName) {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .set_user(user);
+    }
+
+    pub fn set_auth_method(&self, auth_method: AuthMethod) {
+        let mut this = self.0.try_lock().expect("should not deadlock");
+        this.auth_method = Some(auth_method);
+    }
+
+    pub fn has_private_peer_addr(&self) -> bool {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .has_private_peer_addr()
+    }
+
+    pub fn set_error_kind(&self, kind: ErrorKind) {
+        let mut this = self.0.try_lock().expect("should not deadlock");
+        // Do not record errors from the private address to metrics.
+        if !this.has_private_peer_addr() {
+            Metrics::get().proxy.errors_total.inc(kind);
+        }
+        if let Some(ep) = &this.endpoint_id {
+            let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
+            let label = metric.with_labels(kind);
+            metric.get_metric(label).measure(ep);
+        }
+        this.error_kind = Some(kind);
+    }
+
+    pub fn set_success(&self) {
+        let mut this = self.0.try_lock().expect("should not deadlock");
+        this.success = true;
+    }
+
+    pub fn log_connect(&self) {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .log_connect();
+    }
+
+    pub fn protocol(&self) -> Protocol {
+        self.0.try_lock().expect("should not deadlock").protocol
+    }
+
+    pub fn span(&self) -> Span {
+        self.0.try_lock().expect("should not deadlock").span.clone()
+    }
+
+    pub fn session_id(&self) -> Uuid {
+        self.0.try_lock().expect("should not deadlock").session_id
+    }
+
+    pub fn peer_addr(&self) -> IpAddr {
+        self.0.try_lock().expect("should not deadlock").peer_addr
+    }
+
+    pub fn cold_start_info(&self) -> ColdStartInfo {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .cold_start_info
+    }
+
+    pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
+        LatencyTimerPause {
+            ctx: self,
+            start: tokio::time::Instant::now(),
+            waiting_for,
+        }
+    }
+
+    pub fn success(&self) {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .latency_timer
+            .success()
+    }
+}
+
+pub struct LatencyTimerPause<'a> {
+    ctx: &'a RequestMonitoring,
+    start: tokio::time::Instant,
+    waiting_for: Waiting,
+}
+
+impl Drop for LatencyTimerPause<'_> {
+    fn drop(&mut self) {
+        self.ctx
+            .0
+            .try_lock()
+            .expect("should not deadlock")
+            .latency_timer
+            .unpause(self.start, self.waiting_for);
+    }
+}
+
+impl RequestMonitoringInner {
+    fn set_cold_start_info(&mut self, info: ColdStartInfo) {
         self.cold_start_info = info;
         self.latency_timer.cold_start_info(info);
     }
 
-    pub fn set_db_options(&mut self, options: StartupMessageParams) {
-        self.set_application(options.get("application_name").map(SmolStr::from));
-        if let Some(user) = options.get("user") {
-            self.set_user(user.into());
-        }
-        if let Some(dbname) = options.get("database") {
-            self.set_dbname(dbname.into());
-        }
-
-        self.pg_options = Some(options);
-    }
-
-    pub fn set_project(&mut self, x: MetricsAuxInfo) {
-        if self.endpoint_id.is_none() {
-            self.set_endpoint_id(x.endpoint_id.as_str().into())
-        }
-        self.branch = Some(x.branch_id);
-        self.project = Some(x.project_id);
-        self.set_cold_start_info(x.cold_start_info);
-    }
-
-    pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
-        self.project = Some(project_id);
-    }
-
-    pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
+    fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
         if self.endpoint_id.is_none() {
             self.span.record("ep", display(&endpoint_id));
             let metric = &Metrics::get().proxy.connecting_endpoints;
@@ -176,44 +316,23 @@ impl RequestMonitoring {
         }
     }
 
-    pub fn set_dbname(&mut self, dbname: DbName) {
+    fn set_dbname(&mut self, dbname: DbName) {
         self.dbname = Some(dbname);
     }
 
-    pub fn set_user(&mut self, user: RoleName) {
+    fn set_user(&mut self, user: RoleName) {
         self.span.record("role", display(&user));
         self.user = Some(user);
     }
 
-    pub fn set_auth_method(&mut self, auth_method: AuthMethod) {
-        self.auth_method = Some(auth_method);
-    }
-
-    pub fn has_private_peer_addr(&self) -> bool {
+    fn has_private_peer_addr(&self) -> bool {
         match self.peer_addr {
             IpAddr::V4(ip) => ip.is_private(),
             _ => false,
         }
     }
 
-    pub fn set_error_kind(&mut self, kind: ErrorKind) {
-        // Do not record errors from the private address to metrics.
-        if !self.has_private_peer_addr() {
-            Metrics::get().proxy.errors_total.inc(kind);
-        }
-        if let Some(ep) = &self.endpoint_id {
-            let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
-            let label = metric.with_labels(kind);
-            metric.get_metric(label).measure(ep);
-        }
-        self.error_kind = Some(kind);
-    }
-
-    pub fn set_success(&mut self) {
-        self.success = true;
-    }
-
-    pub fn log_connect(&mut self) {
+    fn log_connect(&mut self) {
         let outcome = if self.success {
             ConnectOutcome::Success
         } else {
@@ -256,7 +375,7 @@ impl RequestMonitoring {
     }
 }
 
-impl Drop for RequestMonitoring {
+impl Drop for RequestMonitoringInner {
     fn drop(&mut self) {
         if self.sender.is_some() {
             self.log_connect();
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 543a458274..bb02a476fc 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -23,7 +23,7 @@ use utils::backoff;
 
 use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT};
 
-use super::{RequestMonitoring, LOG_CHAN};
+use super::{RequestMonitoringInner, LOG_CHAN};
 
 #[derive(clap::Args, Clone, Debug)]
 pub struct ParquetUploadArgs {
@@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> {
     }
 }
 
-impl From<&RequestMonitoring> for RequestData {
-    fn from(value: &RequestMonitoring) -> Self {
+impl From<&RequestMonitoringInner> for RequestData {
+    fn from(value: &RequestMonitoringInner) -> Self {
         Self {
             session_id: value.session_id,
             peer_addr: value.peer_addr.to_string(),
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index db25ac0311..0167553e30 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -370,6 +370,7 @@ pub struct CancellationRequest {
     pub kind: CancellationOutcome,
 }
 
+#[derive(Clone, Copy)]
 pub enum Waiting {
     Cplane,
     Client,
@@ -398,12 +399,6 @@ pub struct LatencyTimer {
     outcome: ConnectOutcome,
 }
 
-pub struct LatencyTimerPause<'a> {
-    timer: &'a mut LatencyTimer,
-    start: time::Instant,
-    waiting_for: Waiting,
-}
-
 impl LatencyTimer {
     pub fn new(protocol: Protocol) -> Self {
         Self {
@@ -417,11 +412,13 @@ impl LatencyTimer {
         }
     }
 
-    pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
-        LatencyTimerPause {
-            timer: self,
-            start: Instant::now(),
-            waiting_for,
+    pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) {
+        let dur = start.elapsed();
+        match waiting_for {
+            Waiting::Cplane => self.accumulated.cplane += dur,
+            Waiting::Client => self.accumulated.client += dur,
+            Waiting::Compute => self.accumulated.compute += dur,
+            Waiting::RetryTimeout => self.accumulated.retry += dur,
         }
     }
 
@@ -438,18 +435,6 @@ impl LatencyTimer {
     }
 }
 
-impl Drop for LatencyTimerPause<'_> {
-    fn drop(&mut self) {
-        let dur = self.start.elapsed();
-        match self.waiting_for {
-            Waiting::Cplane => self.timer.accumulated.cplane += dur,
-            Waiting::Client => self.timer.accumulated.client += dur,
-            Waiting::Compute => self.timer.accumulated.compute += dur,
-            Waiting::RetryTimeout => self.timer.accumulated.retry += dur,
-        }
-    }
-}
-
 #[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
 pub enum ConnectOutcome {
     Success,
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 3edefcf21a..2182f38fe7 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -113,18 +113,18 @@ pub async fn task_main(
                 }
             };
 
-            let mut ctx = RequestMonitoring::new(
+            let ctx = RequestMonitoring::new(
                 session_id,
                 peer_addr,
                 crate::metrics::Protocol::Tcp,
                 &config.region,
             );
-            let span = ctx.span.clone();
+            let span = ctx.span();
 
             let startup = Box::pin(
                 handle_client(
                     config,
-                    &mut ctx,
+                    &ctx,
                     cancellation_handler,
                     socket,
                     ClientMode::Tcp,
@@ -240,7 +240,7 @@ impl ReportableError for ClientRequestError {
 
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     cancellation_handler: Arc<CancellationHandlerMain>,
     stream: S,
     mode: ClientMode,
@@ -248,25 +248,25 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     conn_gauge: NumClientConnectionsGuard<'static>,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
     info!(
-        protocol = %ctx.protocol,
+        protocol = %ctx.protocol(),
         "handling interactive connection from client"
     );
 
     let metrics = &Metrics::get().proxy;
-    let proto = ctx.protocol;
+    let proto = ctx.protocol();
     let _request_gauge = metrics.connection_requests.guard(proto);
 
     let tls = config.tls_config.as_ref();
 
     let record_handshake_error = !ctx.has_private_peer_addr();
-    let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
-    let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error);
+    let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+    let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
     let (mut stream, params) =
         match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
             HandshakeData::Startup(stream, params) => (stream, params),
             HandshakeData::Cancel(cancel_key_data) => {
                 return Ok(cancellation_handler
-                    .cancel_session(cancel_key_data, ctx.session_id)
+                    .cancel_session(cancel_key_data, ctx.session_id())
                     .await
                     .map(|()| None)?)
             }
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 82180aaee3..f38e43ba5a 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -46,7 +46,7 @@ pub trait ConnectMechanism {
     type Error: From<Self::ConnectError>;
     async fn connect_once(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         node_info: &console::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError>;
@@ -58,7 +58,7 @@ pub trait ConnectMechanism {
 pub trait ComputeConnectBackend {
     async fn wake_compute(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
 
     fn get_keys(&self) -> Option<&ComputeCredentialKeys>;
@@ -81,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     async fn connect_once(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         node_info: &console::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<PostgresConnection, Self::Error> {
@@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
 /// Try to connect to the compute node, retrying if necessary.
 #[tracing::instrument(skip_all)]
 pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     mechanism: &M,
     user_info: &B,
     allow_self_signed_compute: bool,
@@ -126,7 +126,7 @@ where
         .await
     {
         Ok(res) => {
-            ctx.latency_timer.success();
+            ctx.success();
             Metrics::get().proxy.retries_metric.observe(
                 RetriesMetricGroup {
                     outcome: ConnectOutcome::Success,
@@ -178,7 +178,7 @@ where
             .await
         {
             Ok(res) => {
-                ctx.latency_timer.success();
+                ctx.success();
                 Metrics::get().proxy.retries_metric.observe(
                     RetriesMetricGroup {
                         outcome: ConnectOutcome::Success,
@@ -209,9 +209,7 @@ where
         let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
         num_retries += 1;
 
-        let pause = ctx
-            .latency_timer
-            .pause(crate::metrics::Waiting::RetryTimeout);
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout);
         time::sleep(wait_duration).await;
         drop(pause);
     }
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index d488aea927..c65a5558d9 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -10,6 +10,7 @@ use tracing::{info, warn};
 use crate::{
     auth::endpoint_sni,
     config::{TlsConfig, PG_ALPN_PROTOCOL},
+    context::RequestMonitoring,
     error::ReportableError,
     metrics::Metrics,
     proxy::ERR_INSECURE_CONNECTION,
@@ -67,6 +68,7 @@ pub enum HandshakeData<S> {
 /// we also take an extra care of propagating only the select handshake errors to client.
 #[tracing::instrument(skip_all)]
 pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
+    ctx: &RequestMonitoring,
     stream: S,
     mut tls: Option<&TlsConfig>,
     record_handshake_error: bool,
@@ -80,8 +82,6 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
     let mut stream = PqStream::new(Stream::from_raw(stream));
     loop {
         let msg = stream.read_startup_packet().await?;
-        info!("received {msg:?}");
-
         use FeStartupPacket::*;
         match msg {
             SslRequest { direct } => match stream.get_ref() {
@@ -145,16 +145,20 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
 
                         let conn_info = tls_stream.get_ref().1;
 
+                        // try parse endpoint
+                        let ep = conn_info
+                            .server_name()
+                            .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten());
+                        if let Some(ep) = ep {
+                            ctx.set_endpoint_id(ep);
+                        }
+
                         // check the ALPN, if exists, as required.
                         match conn_info.alpn_protocol() {
                             None | Some(PG_ALPN_PROTOCOL) => {}
                             Some(other) => {
-                                // try parse ep for better error
-                                let ep = conn_info.server_name().and_then(|sni| {
-                                    endpoint_sni(sni, &tls.common_names).ok().flatten()
-                                });
                                 let alpn = String::from_utf8_lossy(other);
-                                warn!(?ep, %alpn, "unexpected ALPN");
+                                warn!(%alpn, "unexpected ALPN");
                                 return Err(HandshakeError::ProtocolViolation);
                             }
                         }
@@ -198,7 +202,12 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                         .await?;
                 }
 
-                info!(?version, session_type = "normal", "successful handshake");
+                info!(
+                    ?version,
+                    ?params,
+                    session_type = "normal",
+                    "successful handshake"
+                );
                 break Ok(HandshakeData::Startup(stream, params));
             }
             // downgrade protocol version
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 5186a9e1b0..d8308c4f2a 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -155,7 +155,7 @@ impl TestAuth for Scram {
         stream: &mut PqStream<Stream<S>>,
     ) -> anyhow::Result<()> {
         let outcome = auth::AuthFlow::new(stream)
-            .begin(auth::Scram(&self.0, &mut RequestMonitoring::test()))
+            .begin(auth::Scram(&self.0, &RequestMonitoring::test()))
             .await?
             .authenticate()
             .await?;
@@ -175,10 +175,11 @@ async fn dummy_proxy(
     auth: impl TestAuth + Send,
 ) -> anyhow::Result<()> {
     let (client, _) = read_proxy_protocol(client).await?;
-    let mut stream = match handshake(client, tls.as_ref(), false).await? {
-        HandshakeData::Startup(stream, _) => stream,
-        HandshakeData::Cancel(_) => bail!("cancellation not supported"),
-    };
+    let mut stream =
+        match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? {
+            HandshakeData::Startup(stream, _) => stream,
+            HandshakeData::Cancel(_) => bail!("cancellation not supported"),
+        };
 
     auth.authenticate(&mut stream).await?;
 
@@ -457,7 +458,7 @@ impl ConnectMechanism for TestConnectMechanism {
 
     async fn connect_once(
         &self,
-        _ctx: &mut RequestMonitoring,
+        _ctx: &RequestMonitoring,
         _node_info: &console::CachedNodeInfo,
         _timeout: std::time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
@@ -565,7 +566,7 @@ fn helper_create_connect_info(
 async fn connect_to_compute_success() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
+    let ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -573,7 +574,7 @@ async fn connect_to_compute_success() {
         max_retries: 5,
         backoff_factor: 2.0,
     };
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
+    connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
         .await
         .unwrap();
     mechanism.verify();
@@ -583,7 +584,7 @@ async fn connect_to_compute_success() {
 async fn connect_to_compute_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
+    let ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -591,7 +592,7 @@ async fn connect_to_compute_retry() {
         max_retries: 5,
         backoff_factor: 2.0,
     };
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
+    connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
         .await
         .unwrap();
     mechanism.verify();
@@ -602,7 +603,7 @@ async fn connect_to_compute_retry() {
 async fn connect_to_compute_non_retry_1() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
+    let ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -610,7 +611,7 @@ async fn connect_to_compute_non_retry_1() {
         max_retries: 5,
         backoff_factor: 2.0,
     };
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
+    connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
         .await
         .unwrap_err();
     mechanism.verify();
@@ -621,7 +622,7 @@ async fn connect_to_compute_non_retry_1() {
 async fn connect_to_compute_non_retry_2() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
+    let ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -629,7 +630,7 @@ async fn connect_to_compute_non_retry_2() {
         max_retries: 5,
         backoff_factor: 2.0,
     };
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
+    connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
         .await
         .unwrap();
     mechanism.verify();
@@ -641,7 +642,7 @@ async fn connect_to_compute_non_retry_3() {
     let _ = env_logger::try_init();
     tokio::time::pause();
     use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
+    let ctx = RequestMonitoring::test();
     let mechanism =
         TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]);
     let user_info = helper_create_connect_info(&mechanism);
@@ -656,7 +657,7 @@ async fn connect_to_compute_non_retry_3() {
         backoff_factor: 2.0,
     };
     connect_to_compute(
-        &mut ctx,
+        &ctx,
         &mechanism,
         &user_info,
         false,
@@ -673,7 +674,7 @@ async fn connect_to_compute_non_retry_3() {
 async fn wake_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
+    let ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -681,7 +682,7 @@ async fn wake_retry() {
         max_retries: 5,
         backoff_factor: 2.0,
     };
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
+    connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
         .await
         .unwrap();
     mechanism.verify();
@@ -692,7 +693,7 @@ async fn wake_retry() {
 async fn wake_non_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
+    let ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -700,7 +701,7 @@ async fn wake_non_retry() {
         max_retries: 5,
         backoff_factor: 2.0,
     };
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
+    connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
         .await
         .unwrap_err();
     mechanism.verify();
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index d96dd0947b..c8ec2b2db6 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -34,9 +34,14 @@ async fn proxy_mitm(
     tokio::spawn(async move {
         // begin handshake with end_server
         let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
-        let (end_client, startup) = match handshake(client1, Some(&server_config1), false)
-            .await
-            .unwrap()
+        let (end_client, startup) = match handshake(
+            &RequestMonitoring::test(),
+            client1,
+            Some(&server_config1),
+            false,
+        )
+        .await
+        .unwrap()
         {
             HandshakeData::Startup(stream, params) => (stream, params),
             HandshakeData::Cancel(_) => panic!("cancellation not supported"),
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index fef349aac0..5b06e8f054 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -14,7 +14,7 @@ use super::connect_compute::ComputeConnectBackend;
 
 pub async fn wake_compute<B: ComputeConnectBackend>(
     num_retries: &mut u32,
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     api: &B,
     config: RetryConfig,
 ) -> Result<CachedNodeInfo, WakeComputeError> {
@@ -52,9 +52,7 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
 
         let wait_duration = retry_after(*num_retries, config);
         *num_retries += 1;
-        let pause = ctx
-            .latency_timer
-            .pause(crate::metrics::Waiting::RetryTimeout);
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout);
         tokio::time::sleep(wait_duration).await;
         drop(pause);
     }
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index efa999ed7d..115bef7375 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -334,7 +334,7 @@ async fn request_handler(
             &config.region,
         );
 
-        let span = ctx.span.clone();
+        let span = ctx.span();
         info!(parent: &span, "performing websocket upgrade");
 
         let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request)
@@ -367,7 +367,7 @@ async fn request_handler(
             crate::metrics::Protocol::Http,
             &config.region,
         );
-        let span = ctx.span.clone();
+        let span = ctx.span();
 
         sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
             .instrument(span)
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 3b86c1838c..80d46c67eb 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -35,15 +35,15 @@ pub struct PoolingBackend {
 impl PoolingBackend {
     pub async fn authenticate(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         config: &AuthenticationConfig,
         conn_info: &ConnInfo,
     ) -> Result<ComputeCredentials, AuthError> {
         let user_info = conn_info.user_info.clone();
         let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
-        if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-            return Err(AuthError::ip_address_not_allowed(ctx.peer_addr));
+        if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+            return Err(AuthError::ip_address_not_allowed(ctx.peer_addr()));
         }
         if !self
             .endpoint_rate_limiter
@@ -100,7 +100,7 @@ impl PoolingBackend {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     pub async fn connect_to_compute(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         conn_info: ConnInfo,
         keys: ComputeCredentials,
         force_new: bool,
@@ -222,7 +222,7 @@ impl ConnectMechanism for TokioMechanism {
 
     async fn connect_once(
         &self,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         node_info: &CachedNodeInfo,
         timeout: Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
@@ -240,7 +240,7 @@ impl ConnectMechanism for TokioMechanism {
             .param("client_encoding", "UTF8")
             .expect("client encoding UTF8 is always valid");
 
-        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let res = config.connect(tokio_postgres::NoTls).await;
         drop(pause);
         let (client, connection) = permit.release_result(res)?;
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index dbc58d48ec..e1dc44dc1c 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -377,7 +377,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
 
     pub fn get(
         self: &Arc<Self>,
-        ctx: &mut RequestMonitoring,
+        ctx: &RequestMonitoring,
         conn_info: &ConnInfo,
     ) -> Result<Option<Client<C>>, HttpConnError> {
         let mut client: Option<ClientInner<C>> = None;
@@ -409,9 +409,9 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
                     cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
                     "pool: reusing connection '{conn_info}'"
                 );
-                client.session.send(ctx.session_id)?;
+                client.session.send(ctx.session_id())?;
                 ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
-                ctx.latency_timer.success();
+                ctx.success();
                 return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
             }
         }
@@ -465,19 +465,19 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
 
 pub fn poll_client<C: ClientInnerExt>(
     global_pool: Arc<GlobalConnPool<C>>,
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     conn_info: ConnInfo,
     client: C,
     mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
     conn_id: uuid::Uuid,
     aux: MetricsAuxInfo,
 ) -> Client<C> {
-    let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol);
-    let mut session_id = ctx.session_id;
+    let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
+    let mut session_id = ctx.session_id();
     let (tx, mut rx) = tokio::sync::watch::channel(session_id);
 
     let span = info_span!(parent: None, "connection", %conn_id);
-    let cold_start_info = ctx.cold_start_info;
+    let cold_start_info = ctx.cold_start_info();
     span.in_scope(|| {
         info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
     });
@@ -766,7 +766,6 @@ mod tests {
                 opt_in: false,
                 max_total_conns: 3,
             },
-            request_timeout: Duration::from_secs(1),
             cancel_set: CancelSet::new(0),
             client_conn_threshold: u64::MAX,
         }));
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 6400e4ac7b..77ec6b1c73 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -144,7 +144,7 @@ impl UserFacingError for ConnInfoError {
 }
 
 fn get_conn_info(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     headers: &HeaderMap,
     tls: &TlsConfig,
 ) -> Result<ConnInfo, ConnInfoError> {
@@ -224,12 +224,12 @@ fn get_conn_info(
 // TODO: return different http error codes
 pub async fn handle(
     config: &'static ProxyConfig,
-    mut ctx: RequestMonitoring,
+    ctx: RequestMonitoring,
     request: Request<Incoming>,
     backend: Arc<PoolingBackend>,
     cancel: CancellationToken,
 ) -> Result<Response<Full<Bytes>>, ApiError> {
-    let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
+    let result = handle_inner(cancel, config, &ctx, request, backend).await;
 
     let mut response = match result {
         Ok(r) => {
@@ -482,13 +482,16 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue>
 async fn handle_inner(
     cancel: CancellationToken,
     config: &'static ProxyConfig,
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     request: Request<Incoming>,
     backend: Arc<PoolingBackend>,
 ) -> Result<Response<Full<Bytes>>, SqlOverHttpError> {
-    let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol);
+    let _requeset_gauge = Metrics::get()
+        .proxy
+        .connection_requests
+        .guard(ctx.protocol());
     info!(
-        protocol = %ctx.protocol,
+        protocol = %ctx.protocol(),
         "handling interactive connection from client"
     );
 
@@ -544,7 +547,7 @@ async fn handle_inner(
                 .await?;
             // not strictly necessary to mark success here,
             // but it's just insurance for if we forget it somewhere else
-            ctx.latency_timer.success();
+            ctx.success();
             Ok::<_, HttpConnError>(client)
         }
         .map_err(SqlOverHttpError::from),
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 0d5b88f07b..4fba4d141c 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -129,7 +129,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
 
 pub async fn serve_websocket(
     config: &'static ProxyConfig,
-    mut ctx: RequestMonitoring,
+    ctx: RequestMonitoring,
     websocket: OnUpgrade,
     cancellation_handler: Arc<CancellationHandlerMain>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -145,7 +145,7 @@ pub async fn serve_websocket(
 
     let res = Box::pin(handle_client(
         config,
-        &mut ctx,
+        &ctx,
         cancellation_handler,
         WebSocketRw::new(websocket),
         ClientMode::Websockets { hostname },

From c0776b8724aa339eb1a06b39d701bb65b74bc180 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 7 Aug 2024 17:50:09 +0300
Subject: [PATCH 26/48] fix: EphemeralFiles can outlive their Timeline via
 `enum LayerManager` (#8229)

Ephemeral files cleanup on drop but did not delay shutdown, leading to
problems with restarting the tenant. The solution is as proposed:
- make ephemeral files carry the gate guard to delay `Timeline::gate`
closing
- flush in-memory layers and strong references to those on
`Timeline::shutdown`

The above are realized by making LayerManager an `enum` with `Open` and
`Closed` variants, and fail requests to modify `LayerMap`.

Additionally:

- fix too eager anyhow conversions in compaction
- unify how we freeze layers and handle errors
- optimize likely_resident_layers to read LayerFileManager hashmap
values instead of bouncing through LayerMap

Fixes: #7830
---
 libs/utils/src/sync/gate.rs                   |   3 +-
 pageserver/benches/bench_ingest.rs            |   6 +-
 pageserver/src/http/routes.rs                 |   5 +-
 pageserver/src/tenant.rs                      |  38 +-
 pageserver/src/tenant/ephemeral_file.rs       |  45 ++-
 .../src/tenant/ephemeral_file/page_caching.rs |  10 +-
 pageserver/src/tenant/layer_map.rs            |   4 +-
 pageserver/src/tenant/mgr.rs                  |   9 +-
 .../src/tenant/storage_layer/delta_layer.rs   |   4 +-
 .../tenant/storage_layer/inmemory_layer.rs    |   4 +-
 .../src/tenant/storage_layer/layer/tests.rs   |  20 +-
 pageserver/src/tenant/timeline.rs             | 331 +++++++++++-------
 pageserver/src/tenant/timeline/compaction.rs  |  35 +-
 .../src/tenant/timeline/detach_ancestor.rs    |  17 +-
 .../src/tenant/timeline/eviction_task.rs      |  66 ++--
 .../src/tenant/timeline/layer_manager.rs      | 214 ++++++-----
 16 files changed, 505 insertions(+), 306 deletions(-)

diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs
index 156b99a010..16ec563fa7 100644
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -78,8 +78,9 @@ impl Drop for GateGuard {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, thiserror::Error)]
 pub enum GateError {
+    #[error("gate is closed")]
     GateClosed,
 }
 
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index af2b6934c6..459394449a 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -61,7 +61,11 @@ async fn ingest(
 
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
 
-    let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?;
+    let gate = utils::sync::gate::Gate::default();
+    let entered = gate.enter().unwrap();
+
+    let layer =
+        InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
 
     let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
     let ctx = RequestContext::new(
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index fdab780bfb..a983d8c4c2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1162,7 +1162,10 @@ async fn layer_map_info_handler(
     let timeline =
         active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
             .await?;
-    let layer_map_info = timeline.layer_map_info(reset).await;
+    let layer_map_info = timeline
+        .layer_map_info(reset)
+        .await
+        .map_err(|_shutdown| ApiError::ShuttingDown)?;
 
     json_response(StatusCode::OK, layer_map_info)
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 989ed0d4eb..2422ab4cf2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -601,6 +601,12 @@ impl From<PageReconstructError> for GcError {
     }
 }
 
+impl From<timeline::layer_manager::Shutdown> for GcError {
+    fn from(_: timeline::layer_manager::Shutdown) -> Self {
+        GcError::TimelineCancelled
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum LoadConfigError {
     #[error("TOML deserialization error: '{0}'")]
@@ -710,6 +716,7 @@ impl Tenant {
                     .read()
                     .await
                     .layer_map()
+                    .expect("currently loading, layer manager cannot be shutdown already")
                     .iter_historic_layers()
                     .next()
                     .is_some(),
@@ -4674,10 +4681,10 @@ mod tests {
 
         let layer_map = tline.layers.read().await;
         let level0_deltas = layer_map
-            .layer_map()
-            .get_level0_deltas()
-            .into_iter()
-            .map(|desc| layer_map.get_from_desc(&desc))
+            .layer_map()?
+            .level0_deltas()
+            .iter()
+            .map(|desc| layer_map.get_from_desc(desc))
             .collect::<Vec<_>>();
 
         assert!(!level0_deltas.is_empty());
@@ -4908,11 +4915,13 @@ mod tests {
         let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
 
         let guard = tline.layers.read().await;
-        guard.layer_map().dump(true, &ctx).await?;
+        let lm = guard.layer_map()?;
+
+        lm.dump(true, &ctx).await?;
 
         let mut reads = Vec::new();
         let mut prev = None;
-        guard.layer_map().iter_historic_layers().for_each(|desc| {
+        lm.iter_historic_layers().for_each(|desc| {
             if !desc.is_delta() {
                 prev = Some(desc.clone());
                 return;
@@ -5918,23 +5927,12 @@ mod tests {
             tline.freeze_and_flush().await?; // force create a delta layer
         }
 
-        let before_num_l0_delta_files = tline
-            .layers
-            .read()
-            .await
-            .layer_map()
-            .get_level0_deltas()
-            .len();
+        let before_num_l0_delta_files =
+            tline.layers.read().await.layer_map()?.level0_deltas().len();
 
         tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
 
-        let after_num_l0_delta_files = tline
-            .layers
-            .read()
-            .await
-            .layer_map()
-            .get_level0_deltas()
-            .len();
+        let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
 
         assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
 
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index bb65ae24fc..770f3ca5f0 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -29,6 +29,7 @@ impl EphemeralFile {
         conf: &PageServerConf,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
+        gate_guard: utils::sync::gate::GateGuard,
         ctx: &RequestContext,
     ) -> Result<EphemeralFile, io::Error> {
         static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -51,10 +52,12 @@ impl EphemeralFile {
         )
         .await?;
 
+        let prewarm = conf.l0_flush.prewarm_on_write();
+
         Ok(EphemeralFile {
             _tenant_shard_id: tenant_shard_id,
             _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
+            rw: page_caching::RW::new(file, prewarm, gate_guard),
         })
     }
 
@@ -161,7 +164,11 @@ mod tests {
     async fn test_ephemeral_blobs() -> Result<(), io::Error> {
         let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
 
-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
+        let gate = utils::sync::gate::Gate::default();
+
+        let entered = gate.enter().unwrap();
+
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
 
         let pos_foo = file.write_blob(b"foo", &ctx).await?;
         assert_eq!(
@@ -215,4 +222,38 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn ephemeral_file_holds_gate_open() {
+        const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
+
+        let (conf, tenant_id, timeline_id, ctx) =
+            harness("ephemeral_file_holds_gate_open").unwrap();
+
+        let gate = utils::sync::gate::Gate::default();
+
+        let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+            .await
+            .unwrap();
+
+        let mut closing = tokio::task::spawn(async move {
+            gate.close().await;
+        });
+
+        // gate is entered until the ephemeral file is dropped
+        // do not start paused tokio-epoll-uring has a sleep loop
+        tokio::time::pause();
+        tokio::time::timeout(FOREVER, &mut closing)
+            .await
+            .expect_err("closing cannot complete before dropping");
+
+        // this is a requirement of the reset_tenant functionality: we have to be able to restart a
+        // tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate
+        drop(file);
+
+        tokio::time::timeout(FOREVER, &mut closing)
+            .await
+            .expect("closing completes right away")
+            .expect("closing does not panic");
+    }
 }
diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs
index 43b9fff28d..0a12b64a7c 100644
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -18,6 +18,8 @@ use super::zero_padded_read_write;
 pub struct RW {
     page_cache_file_id: page_cache::FileId,
     rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
+    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
+    _gate_guard: utils::sync::gate::GateGuard,
 }
 
 /// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
@@ -29,7 +31,11 @@ pub enum PrewarmOnWrite {
 }
 
 impl RW {
-    pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
+    pub fn new(
+        file: VirtualFile,
+        prewarm_on_write: PrewarmOnWrite,
+        _gate_guard: utils::sync::gate::GateGuard,
+    ) -> Self {
         let page_cache_file_id = page_cache::next_file_id();
         Self {
             page_cache_file_id,
@@ -38,6 +44,7 @@ impl RW {
                 file,
                 prewarm_on_write,
             )),
+            _gate_guard,
         }
     }
 
@@ -145,6 +152,7 @@ impl Drop for RW {
         // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
 
         // unlink the file
+        // we are clear to do this, because we have entered a gate
         let res = std::fs::remove_file(&self.rw.as_writer().file.path);
         if let Err(e) = res {
             if e.kind() != std::io::ErrorKind::NotFound {
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index ba9c08f6e7..844f117ea2 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -846,8 +846,8 @@ impl LayerMap {
     }
 
     /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
-        self.l0_delta_layers.to_vec()
+    pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
+        &self.l0_delta_layers
     }
 
     /// debugging function to print out the contents of the layer map
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index b5568d37b5..7901fc3554 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1767,14 +1767,9 @@ impl TenantManager {
             let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
             for timeline in timelines.values() {
                 tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
-                let timeline_layers = timeline
-                    .layers
-                    .read()
-                    .await
-                    .likely_resident_layers()
-                    .collect::<Vec<_>>();
+                let layers = timeline.layers.read().await;
 
-                for layer in timeline_layers {
+                for layer in layers.likely_resident_layers() {
                     let relative_path = layer
                         .local_path()
                         .strip_prefix(&parent_path)
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index bff8f7cb24..f4e965b99a 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1957,6 +1957,7 @@ pub(crate) mod test {
             .await
             .likely_resident_layers()
             .next()
+            .cloned()
             .unwrap();
 
         {
@@ -2031,7 +2032,8 @@ pub(crate) mod test {
             .read()
             .await
             .likely_resident_layers()
-            .find(|x| x != &initdb_layer)
+            .find(|&x| x != &initdb_layer)
+            .cloned()
             .unwrap();
 
         // create a copy for the timeline, so we don't overwrite the file
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index f118f3d8d8..57d93feaaf 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -385,11 +385,13 @@ impl InMemoryLayer {
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         start_lsn: Lsn,
+        gate_guard: utils::sync::gate::GateGuard,
         ctx: &RequestContext,
     ) -> Result<InMemoryLayer> {
         trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
 
-        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
+        let file =
+            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
         let key = InMemoryLayerFileId(file.page_cache_file_id());
 
         Ok(InMemoryLayer {
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 6b0d5f09ff..bffd2db800 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -39,7 +39,7 @@ async fn smoke_test() {
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().collect::<Vec<_>>()
+            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
         };
 
         assert_eq!(layers.len(), 1);
@@ -176,7 +176,7 @@ async fn smoke_test() {
     {
         let layers = &[layer];
         let mut g = timeline.layers.write().await;
-        g.finish_gc_timeline(layers);
+        g.open_mut().unwrap().finish_gc_timeline(layers);
         // this just updates the remote_physical_size for demonstration purposes
         rtc.schedule_gc_update(layers).unwrap();
     }
@@ -216,7 +216,7 @@ async fn evict_and_wait_on_wanted_deleted() {
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().collect::<Vec<_>>()
+            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
         };
 
         assert_eq!(layers.len(), 1);
@@ -260,7 +260,7 @@ async fn evict_and_wait_on_wanted_deleted() {
     // the deletion of the layer in remote_storage happens.
     {
         let mut layers = timeline.layers.write().await;
-        layers.finish_gc_timeline(&[layer]);
+        layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
     }
 
     SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
@@ -301,7 +301,7 @@ fn read_wins_pending_eviction() {
         let layer = {
             let mut layers = {
                 let layers = timeline.layers.read().await;
-                layers.likely_resident_layers().collect::<Vec<_>>()
+                layers.likely_resident_layers().cloned().collect::<Vec<_>>()
             };
 
             assert_eq!(layers.len(), 1);
@@ -433,7 +433,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
         let layer = {
             let mut layers = {
                 let layers = timeline.layers.read().await;
-                layers.likely_resident_layers().collect::<Vec<_>>()
+                layers.likely_resident_layers().cloned().collect::<Vec<_>>()
             };
 
             assert_eq!(layers.len(), 1);
@@ -602,7 +602,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().collect::<Vec<_>>()
+            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
         };
 
         assert_eq!(layers.len(), 1);
@@ -682,7 +682,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().collect::<Vec<_>>()
+            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
         };
 
         assert_eq!(layers.len(), 1);
@@ -801,9 +801,9 @@ async fn eviction_cancellation_on_drop() {
     let (evicted_layer, not_evicted) = {
         let mut layers = {
             let mut guard = timeline.layers.write().await;
-            let layers = guard.likely_resident_layers().collect::<Vec<_>>();
+            let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
             // remove the layers from layermap
-            guard.finish_gc_timeline(&layers);
+            guard.open_mut().unwrap().finish_gc_timeline(&layers);
 
             layers
         };
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a05e4e0712..8f9ff78fd8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -527,6 +527,12 @@ pub(crate) enum PageReconstructError {
     MissingKey(MissingKeyError),
 }
 
+impl From<layer_manager::Shutdown> for PageReconstructError {
+    fn from(_: layer_manager::Shutdown) -> Self {
+        PageReconstructError::Cancelled
+    }
+}
+
 impl GetVectoredError {
     #[cfg(test)]
     pub(crate) fn is_missing_key_error(&self) -> bool {
@@ -534,6 +540,12 @@ impl GetVectoredError {
     }
 }
 
+impl From<layer_manager::Shutdown> for GetVectoredError {
+    fn from(_: layer_manager::Shutdown) -> Self {
+        GetVectoredError::Cancelled
+    }
+}
+
 pub struct MissingKeyError {
     key: Key,
     shard: ShardNumber,
@@ -597,6 +609,12 @@ pub(crate) enum CreateImageLayersError {
     Other(#[from] anyhow::Error),
 }
 
+impl From<layer_manager::Shutdown> for CreateImageLayersError {
+    fn from(_: layer_manager::Shutdown) -> Self {
+        CreateImageLayersError::Cancelled
+    }
+}
+
 #[derive(thiserror::Error, Debug, Clone)]
 pub(crate) enum FlushLayerError {
     /// Timeline cancellation token was cancelled
@@ -634,6 +652,12 @@ impl FlushLayerError {
     }
 }
 
+impl From<layer_manager::Shutdown> for FlushLayerError {
+    fn from(_: layer_manager::Shutdown) -> Self {
+        FlushLayerError::Cancelled
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum GetVectoredError {
     #[error("timeline shutting down")]
@@ -1198,12 +1222,7 @@ impl Timeline {
     /// Hence, the result **does not represent local filesystem usage**.
     pub(crate) async fn layer_size_sum(&self) -> u64 {
         let guard = self.layers.read().await;
-        let layer_map = guard.layer_map();
-        let mut size = 0;
-        for l in layer_map.iter_historic_layers() {
-            size += l.file_size;
-        }
-        size
+        guard.layer_size_sum()
     }
 
     pub(crate) fn resident_physical_size(&self) -> u64 {
@@ -1370,16 +1389,15 @@ impl Timeline {
     // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
     // polluting the span hierarchy.
     pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
-        let to_lsn = {
+        let token = {
             // Freeze the current open in-memory layer. It will be written to disk on next
             // iteration.
             let mut g = self.write_lock.lock().await;
 
             let to_lsn = self.get_last_record_lsn();
-            self.freeze_inmem_layer_at(to_lsn, &mut g).await;
-            to_lsn
+            self.freeze_inmem_layer_at(to_lsn, &mut g).await?
         };
-        self.flush_frozen_layers_and_wait(to_lsn).await
+        self.wait_flush_completion(token).await
     }
 
     // Check if an open ephemeral layer should be closed: this provides
@@ -1393,12 +1411,20 @@ impl Timeline {
             return;
         };
 
+        // FIXME: why not early exit? because before #7927 the state would had been cleared every
+        // time, and this was missed.
+        // if write_guard.is_none() { return; }
+
         let Ok(layers_guard) = self.layers.try_read() else {
             // Don't block if the layer lock is busy
             return;
         };
 
-        let Some(open_layer) = &layers_guard.layer_map().open_layer else {
+        let Ok(lm) = layers_guard.layer_map() else {
+            return;
+        };
+
+        let Some(open_layer) = &lm.open_layer else {
             // If there is no open layer, we have no layer freezing to do.  However, we might need to generate
             // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
             // that didn't result in writes to this shard.
@@ -1424,9 +1450,16 @@ impl Timeline {
                     );
 
                     // The flush loop will update remote consistent LSN as well as disk consistent LSN.
-                    self.flush_frozen_layers_and_wait(last_record_lsn)
-                        .await
-                        .ok();
+                    // We know there is no open layer, so we can request freezing without actually
+                    // freezing anything. This is true even if we have dropped the layers_guard, we
+                    // still hold the write_guard.
+                    let _ = async {
+                        let token = self
+                            .freeze_inmem_layer_at(last_record_lsn, &mut write_guard)
+                            .await?;
+                        self.wait_flush_completion(token).await
+                    }
+                    .await;
                 }
             }
 
@@ -1464,33 +1497,26 @@ impl Timeline {
             self.last_freeze_at.load(),
             open_layer.get_opened_at(),
         ) {
-            let at_lsn = match open_layer.info() {
+            match open_layer.info() {
                 InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
                     // We may reach this point if the layer was already frozen by not yet flushed: flushing
                     // happens asynchronously in the background.
                     tracing::debug!(
                         "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
                     );
-                    None
                 }
                 InMemoryLayerInfo::Open { .. } => {
                     // Upgrade to a write lock and freeze the layer
                     drop(layers_guard);
-                    let mut layers_guard = self.layers.write().await;
-                    let froze = layers_guard
-                        .try_freeze_in_memory_layer(
-                            current_lsn,
-                            &self.last_freeze_at,
-                            &mut write_guard,
-                        )
+                    let res = self
+                        .freeze_inmem_layer_at(current_lsn, &mut write_guard)
                         .await;
-                    Some(current_lsn).filter(|_| froze)
-                }
-            };
-            if let Some(lsn) = at_lsn {
-                let res: Result<u64, _> = self.flush_frozen_layers(lsn);
-                if let Err(e) = res {
-                    tracing::info!("failed to flush frozen layer after background freeze: {e:#}");
+
+                    if let Err(e) = res {
+                        tracing::info!(
+                            "failed to flush frozen layer after background freeze: {e:#}"
+                        );
+                    }
                 }
             }
         }
@@ -1644,6 +1670,11 @@ impl Timeline {
                     // about corner cases like s3 suddenly hanging up?
                     self.remote_client.shutdown().await;
                 }
+                Err(FlushLayerError::Cancelled) => {
+                    // this is likely the second shutdown, ignore silently.
+                    // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
+                    debug_assert!(self.cancel.is_cancelled());
+                }
                 Err(e) => {
                     // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
                     // we have some extra WAL replay to do next time the timeline starts.
@@ -1662,6 +1693,7 @@ impl Timeline {
         // Transition the remote_client into a state where it's only useful for timeline deletion.
         // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
         self.remote_client.stop();
+
         // As documented in remote_client.stop()'s doc comment, it's our responsibility
         // to shut down the upload queue tasks.
         // TODO: fix that, task management should be encapsulated inside remote_client.
@@ -1672,10 +1704,17 @@ impl Timeline {
         )
         .await;
 
-        // TODO: work toward making this a no-op. See this funciton's doc comment for more context.
+        // TODO: work toward making this a no-op. See this function's doc comment for more context.
         tracing::debug!("Waiting for tasks...");
         task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
 
+        {
+            // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate
+            // open.
+            let mut write_guard = self.write_lock.lock().await;
+            self.layers.write().await.shutdown(&mut write_guard);
+        }
+
         // Finally wait until any gate-holders are complete.
         //
         // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
@@ -1769,9 +1808,12 @@ impl Timeline {
         }
     }
 
-    pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
+    pub(crate) async fn layer_map_info(
+        &self,
+        reset: LayerAccessStatsReset,
+    ) -> Result<LayerMapInfo, layer_manager::Shutdown> {
         let guard = self.layers.read().await;
-        let layer_map = guard.layer_map();
+        let layer_map = guard.layer_map()?;
         let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
         if let Some(open_layer) = &layer_map.open_layer {
             in_memory_layers.push(open_layer.info());
@@ -1780,16 +1822,15 @@ impl Timeline {
             in_memory_layers.push(frozen_layer.info());
         }
 
-        let mut historic_layers = Vec::new();
-        for historic_layer in layer_map.iter_historic_layers() {
-            let historic_layer = guard.get_from_desc(&historic_layer);
-            historic_layers.push(historic_layer.info(reset));
-        }
+        let historic_layers = layer_map
+            .iter_historic_layers()
+            .map(|desc| guard.get_from_desc(&desc).info(reset))
+            .collect();
 
-        LayerMapInfo {
+        Ok(LayerMapInfo {
             in_memory_layers,
             historic_layers,
-        }
+        })
     }
 
     #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
@@ -1797,7 +1838,7 @@ impl Timeline {
         &self,
         layer_file_name: &LayerName,
     ) -> anyhow::Result<Option<bool>> {
-        let Some(layer) = self.find_layer(layer_file_name).await else {
+        let Some(layer) = self.find_layer(layer_file_name).await? else {
             return Ok(None);
         };
 
@@ -1818,7 +1859,7 @@ impl Timeline {
             .enter()
             .map_err(|_| anyhow::anyhow!("Shutting down"))?;
 
-        let Some(local_layer) = self.find_layer(layer_file_name).await else {
+        let Some(local_layer) = self.find_layer(layer_file_name).await? else {
             return Ok(None);
         };
 
@@ -2304,7 +2345,10 @@ impl Timeline {
         let mut layers = self.layers.try_write().expect(
             "in the context where we call this function, no other task has access to the object",
         );
-        layers.initialize_empty(Lsn(start_lsn.0));
+        layers
+            .open_mut()
+            .expect("in this context the LayerManager must still be open")
+            .initialize_empty(Lsn(start_lsn.0));
     }
 
     /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only
@@ -2436,7 +2480,10 @@ impl Timeline {
 
         let num_layers = loaded_layers.len();
 
-        guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
+        guard
+            .open_mut()
+            .expect("layermanager must be open during init")
+            .initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
 
         self.remote_client
             .schedule_layer_file_deletion(&needs_cleanup)?;
@@ -2471,7 +2518,7 @@ impl Timeline {
 
         // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
         drop(guard); // drop write lock, update_layer_visibility will take a read lock.
-        self.update_layer_visibility().await;
+        self.update_layer_visibility().await?;
 
         info!(
             "loaded layer map with {} layers at {}, total physical size: {}",
@@ -2893,16 +2940,17 @@ impl Timeline {
         }
     }
 
-    async fn find_layer(&self, layer_name: &LayerName) -> Option<Layer> {
+    async fn find_layer(
+        &self,
+        layer_name: &LayerName,
+    ) -> Result<Option<Layer>, layer_manager::Shutdown> {
         let guard = self.layers.read().await;
-        for historic_layer in guard.layer_map().iter_historic_layers() {
-            let historic_layer_name = historic_layer.layer_name();
-            if layer_name == &historic_layer_name {
-                return Some(guard.get_from_desc(&historic_layer));
-            }
-        }
-
-        None
+        let layer = guard
+            .layer_map()?
+            .iter_historic_layers()
+            .find(|l| &l.layer_name() == layer_name)
+            .map(|found| guard.get_from_desc(&found));
+        Ok(layer)
     }
 
     /// The timeline heatmap is a hint to secondary locations from the primary location,
@@ -2953,6 +3001,7 @@ impl Timeline {
 }
 
 impl Timeline {
+    #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
     #[allow(clippy::doc_lazy_continuation)]
     /// Get the data needed to reconstruct all keys in the provided keyspace
     ///
@@ -3104,7 +3153,7 @@ impl Timeline {
             // which turns out to be a perf bottleneck in some cases.
             if !unmapped_keyspace.is_empty() {
                 let guard = timeline.layers.read().await;
-                let layers = guard.layer_map();
+                let layers = guard.layer_map()?;
 
                 let in_memory_layer = layers.find_in_memory_layer(|l| {
                     let start_lsn = l.get_lsn_range().start;
@@ -3256,22 +3305,35 @@ impl Timeline {
         }
     }
 
+    /// Returns a non-frozen open in-memory layer for ingestion.
     ///
-    /// Get a handle to the latest layer for appending.
-    ///
+    /// Takes a witness of timeline writer state lock being held, because it makes no sense to call
+    /// this function without holding the mutex.
     async fn get_layer_for_write(
         &self,
         lsn: Lsn,
+        _guard: &tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<InMemoryLayer>> {
         let mut guard = self.layers.write().await;
+        let gate_guard = self.gate.enter().context("enter gate for inmem layer")?;
+
+        let last_record_lsn = self.get_last_record_lsn();
+        ensure!(
+            lsn > last_record_lsn,
+            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
+            lsn,
+            last_record_lsn,
+        );
+
         let layer = guard
+            .open_mut()?
             .get_layer_for_write(
                 lsn,
-                self.get_last_record_lsn(),
                 self.conf,
                 self.timeline_id,
                 self.tenant_shard_id,
+                gate_guard,
                 ctx,
             )
             .await?;
@@ -3285,21 +3347,48 @@ impl Timeline {
         self.last_record_lsn.advance(new_lsn);
     }
 
+    /// Freeze any existing open in-memory layer and unconditionally notify the flush loop.
+    ///
+    /// Unconditional flush loop notification is given because in sharded cases we will want to
+    /// leave an Lsn gap. Unsharded tenants do not have Lsn gaps.
     async fn freeze_inmem_layer_at(
         &self,
         at: Lsn,
         write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
-    ) {
+    ) -> Result<u64, FlushLayerError> {
         let frozen = {
             let mut guard = self.layers.write().await;
             guard
+                .open_mut()?
                 .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
                 .await
         };
+
         if frozen {
             let now = Instant::now();
             *(self.last_freeze_ts.write().unwrap()) = now;
         }
+
+        // Increment the flush cycle counter and wake up the flush task.
+        // Remember the new value, so that when we listen for the flush
+        // to finish, we know when the flush that we initiated has
+        // finished, instead of some other flush that was started earlier.
+        let mut my_flush_request = 0;
+
+        let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
+        if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
+            return Err(FlushLayerError::NotRunning(flush_loop_state));
+        }
+
+        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
+            my_flush_request = *counter + 1;
+            *counter = my_flush_request;
+            *lsn = std::cmp::max(at, *lsn);
+        });
+
+        assert_ne!(my_flush_request, 0);
+
+        Ok(my_flush_request)
     }
 
     /// Layer flusher task's main loop.
@@ -3336,7 +3425,11 @@ impl Timeline {
 
                 let layer_to_flush = {
                     let guard = self.layers.read().await;
-                    guard.layer_map().frozen_layers.front().cloned()
+                    let Ok(lm) = guard.layer_map() else {
+                        info!("dropping out of flush loop for timeline shutdown");
+                        return;
+                    };
+                    lm.frozen_layers.front().cloned()
                     // drop 'layers' lock to allow concurrent reads and writes
                 };
                 let Some(layer_to_flush) = layer_to_flush else {
@@ -3393,34 +3486,7 @@ impl Timeline {
         }
     }
 
-    /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk.
-    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`].
-    ///
-    /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the
-    /// case, it means no data will be written between the top of the highest frozen layer and
-    /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data
-    /// locally for that part of the WAL.
-    fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result<u64, FlushLayerError> {
-        // Increment the flush cycle counter and wake up the flush task.
-        // Remember the new value, so that when we listen for the flush
-        // to finish, we know when the flush that we initiated has
-        // finished, instead of some other flush that was started earlier.
-        let mut my_flush_request = 0;
-
-        let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
-        if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
-            return Err(FlushLayerError::NotRunning(flush_loop_state));
-        }
-
-        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
-            my_flush_request = *counter + 1;
-            *counter = my_flush_request;
-            *lsn = std::cmp::max(at_lsn, *lsn);
-        });
-
-        Ok(my_flush_request)
-    }
-
+    /// Waits any flush request created by [`Self::freeze_inmem_layer_at`] to complete.
     async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
         let mut rx = self.layer_flush_done_tx.subscribe();
         loop {
@@ -3453,11 +3519,6 @@ impl Timeline {
         }
     }
 
-    async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> {
-        let token = self.flush_frozen_layers(at_lsn)?;
-        self.wait_flush_completion(token).await
-    }
-
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
     ///
     /// Return value is the last lsn (inclusive) of the layer that was frozen.
@@ -3594,11 +3655,11 @@ impl Timeline {
         {
             let mut guard = self.layers.write().await;
 
-            if self.cancel.is_cancelled() {
-                return Err(FlushLayerError::Cancelled);
-            }
-
-            guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
+            guard.open_mut()?.finish_flush_l0_layer(
+                delta_layer_to_add.as_ref(),
+                &frozen_layer,
+                &self.metrics,
+            );
 
             if self.set_disk_consistent_lsn(disk_consistent_lsn) {
                 // Schedule remote uploads that will reflect our new disk_consistent_lsn
@@ -3806,7 +3867,9 @@ impl Timeline {
         let threshold = self.get_image_creation_threshold();
 
         let guard = self.layers.read().await;
-        let layers = guard.layer_map();
+        let Ok(layers) = guard.layer_map() else {
+            return false;
+        };
 
         let mut max_deltas = 0;
         for part_range in &partition.ranges {
@@ -4214,13 +4277,16 @@ impl Timeline {
         let mut guard = self.layers.write().await;
 
         // FIXME: we could add the images to be uploaded *before* returning from here, but right
-        // now they are being scheduled outside of write lock
-        guard.track_new_image_layers(&image_layers, &self.metrics);
+        // now they are being scheduled outside of write lock; current way is inconsistent with
+        // compaction lock order.
+        guard
+            .open_mut()?
+            .track_new_image_layers(&image_layers, &self.metrics);
         drop_wlock(guard);
         timer.stop_and_record();
 
         // Creating image layers may have caused some previously visible layers to be covered
-        self.update_layer_visibility().await;
+        self.update_layer_visibility().await?;
 
         Ok(image_layers)
     }
@@ -4379,6 +4445,12 @@ impl CompactionError {
     }
 }
 
+impl From<layer_manager::Shutdown> for CompactionError {
+    fn from(_: layer_manager::Shutdown) -> Self {
+        CompactionError::ShuttingDown
+    }
+}
+
 #[serde_as]
 #[derive(serde::Serialize)]
 struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration);
@@ -4484,11 +4556,14 @@ impl Timeline {
             .collect();
 
         if !new_images.is_empty() {
-            guard.track_new_image_layers(new_images, &self.metrics);
+            guard
+                .open_mut()?
+                .track_new_image_layers(new_images, &self.metrics);
         }
 
-        // deletion will happen later, the layer file manager calls garbage_collect_on_drop
-        guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
+        guard
+            .open_mut()?
+            .finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
 
         self.remote_client
             .schedule_compaction_update(&remove_layers, new_deltas)?;
@@ -4502,7 +4577,7 @@ impl Timeline {
         self: &Arc<Self>,
         mut replace_layers: Vec<(Layer, ResidentLayer)>,
         mut drop_layers: Vec<Layer>,
-    ) -> Result<(), super::upload_queue::NotInitialized> {
+    ) -> Result<(), CompactionError> {
         let mut guard = self.layers.write().await;
 
         // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
@@ -4510,7 +4585,9 @@ impl Timeline {
         replace_layers.retain(|(l, _)| guard.contains(l));
         drop_layers.retain(|l| guard.contains(l));
 
-        guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
+        guard
+            .open_mut()?
+            .rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
 
         let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
 
@@ -4799,7 +4876,7 @@ impl Timeline {
         //
         // TODO holding a write lock is too agressive and avoidable
         let mut guard = self.layers.write().await;
-        let layers = guard.layer_map();
+        let layers = guard.layer_map()?;
         'outer: for l in layers.iter_historic_layers() {
             result.layers_total += 1;
 
@@ -4927,7 +5004,7 @@ impl Timeline {
                     }
                 })?;
 
-            guard.finish_gc_timeline(&gc_layers);
+            guard.open_mut()?.finish_gc_timeline(&gc_layers);
 
             #[cfg(feature = "testing")]
             {
@@ -5083,9 +5160,13 @@ impl Timeline {
 
         let remaining = {
             let guard = self.layers.read().await;
-            guard
-                .layer_map()
-                .iter_historic_layers()
+            let Ok(lm) = guard.layer_map() else {
+                // technically here we could look into iterating accessible layers, but downloading
+                // all layers of a shutdown timeline makes no sense regardless.
+                tracing::info!("attempted to download all layers of shutdown timeline");
+                return;
+            };
+            lm.iter_historic_layers()
                 .map(|desc| guard.get_from_desc(&desc))
                 .collect::<Vec<_>>()
         };
@@ -5195,7 +5276,7 @@ impl Timeline {
                 let last_activity_ts = layer.latest_activity();
 
                 EvictionCandidate {
-                    layer: layer.into(),
+                    layer: layer.to_owned().into(),
                     last_activity_ts,
                     relative_last_activity: finite_f32::FiniteF32::ZERO,
                 }
@@ -5280,7 +5361,7 @@ impl Timeline {
 
         {
             let mut guard = self.layers.write().await;
-            guard.force_insert_layer(image_layer);
+            guard.open_mut().unwrap().force_insert_layer(image_layer);
         }
 
         Ok(())
@@ -5324,7 +5405,7 @@ impl Timeline {
             }
 
             let guard = self.layers.read().await;
-            for layer in guard.layer_map().iter_historic_layers() {
+            for layer in guard.layer_map()?.iter_historic_layers() {
                 if layer.is_delta()
                     && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
                     && layer.lsn_range != deltas.lsn_range
@@ -5354,7 +5435,7 @@ impl Timeline {
 
         {
             let mut guard = self.layers.write().await;
-            guard.force_insert_layer(delta_layer);
+            guard.open_mut().unwrap().force_insert_layer(delta_layer);
         }
 
         Ok(())
@@ -5369,7 +5450,7 @@ impl Timeline {
     ) -> anyhow::Result<Vec<(Key, Bytes)>> {
         let mut all_data = Vec::new();
         let guard = self.layers.read().await;
-        for layer in guard.layer_map().iter_historic_layers() {
+        for layer in guard.layer_map()?.iter_historic_layers() {
             if !layer.is_delta() && layer.image_layer_lsn() == lsn {
                 let layer = guard.get_from_desc(&layer);
                 let mut reconstruct_data = ValuesReconstructState::default();
@@ -5397,7 +5478,7 @@ impl Timeline {
     ) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
         let mut layers = Vec::new();
         let guard = self.layers.read().await;
-        for layer in guard.layer_map().iter_historic_layers() {
+        for layer in guard.layer_map()?.iter_historic_layers() {
             layers.push(layer.key());
         }
         Ok(layers)
@@ -5414,7 +5495,7 @@ impl Timeline {
 /// Tracking writes ingestion does to a particular in-memory layer.
 ///
 /// Cleared upon freezing a layer.
-struct TimelineWriterState {
+pub(crate) struct TimelineWriterState {
     open_layer: Arc<InMemoryLayer>,
     current_size: u64,
     // Previous Lsn which passed through
@@ -5522,7 +5603,10 @@ impl<'a> TimelineWriter<'a> {
     }
 
     async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> {
-        let layer = self.tl.get_layer_for_write(at, ctx).await?;
+        let layer = self
+            .tl
+            .get_layer_for_write(at, &self.write_guard, ctx)
+            .await?;
         let initial_size = layer.size().await?;
 
         let last_freeze_at = self.last_freeze_at.load();
@@ -5535,15 +5619,15 @@ impl<'a> TimelineWriter<'a> {
         Ok(())
     }
 
-    async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
+    async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> {
         let current_size = self.write_guard.as_ref().unwrap().current_size;
 
         // self.write_guard will be taken by the freezing
         self.tl
             .freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
-            .await;
+            .await?;
 
-        self.tl.flush_frozen_layers(freeze_at)?;
+        assert!(self.write_guard.is_none());
 
         if current_size >= self.get_checkpoint_distance() * 2 {
             warn!("Flushed oversized open layer with size {}", current_size)
@@ -5708,6 +5792,7 @@ mod tests {
         let layers = timeline.layers.read().await;
         let desc = layers
             .layer_map()
+            .unwrap()
             .iter_historic_layers()
             .next()
             .expect("must find one layer to evict");
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 276d7b4967..87ec46c0b5 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -371,7 +371,7 @@ impl Timeline {
         );
 
         let layers = self.layers.read().await;
-        for layer_desc in layers.layer_map().iter_historic_layers() {
+        for layer_desc in layers.layer_map()?.iter_historic_layers() {
             let layer = layers.get_from_desc(&layer_desc);
             if layer.metadata().shard.shard_count == self.shard_identity.count {
                 // This layer does not belong to a historic ancestor, no need to re-image it.
@@ -549,7 +549,9 @@ impl Timeline {
     ///
     /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
     /// that we know won't be needed for reads.
-    pub(super) async fn update_layer_visibility(&self) {
+    pub(super) async fn update_layer_visibility(
+        &self,
+    ) -> Result<(), super::layer_manager::Shutdown> {
         let head_lsn = self.get_last_record_lsn();
 
         // We will sweep through layers in reverse-LSN order.  We only do historic layers.  L0 deltas
@@ -557,7 +559,7 @@ impl Timeline {
         // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
         // they will be subject to L0->L1 compaction in the near future.
         let layer_manager = self.layers.read().await;
-        let layer_map = layer_manager.layer_map();
+        let layer_map = layer_manager.layer_map()?;
 
         let readable_points = {
             let children = self.gc_info.read().unwrap().retain_lsns.clone();
@@ -580,6 +582,7 @@ impl Timeline {
         // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
         // avoid assuming that everything at a branch point is visible.
         drop(covered);
+        Ok(())
     }
 
     /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
@@ -633,12 +636,8 @@ impl Timeline {
     ) -> Result<CompactLevel0Phase1Result, CompactionError> {
         stats.read_lock_held_spawn_blocking_startup_micros =
             stats.read_lock_acquisition_micros.till_now(); // set by caller
-        let layers = guard.layer_map();
-        let level0_deltas = layers.get_level0_deltas();
-        let mut level0_deltas = level0_deltas
-            .into_iter()
-            .map(|x| guard.get_from_desc(&x))
-            .collect_vec();
+        let layers = guard.layer_map()?;
+        let level0_deltas = layers.level0_deltas();
         stats.level0_deltas_count = Some(level0_deltas.len());
 
         // Only compact if enough layers have accumulated.
@@ -651,6 +650,11 @@ impl Timeline {
             return Ok(CompactLevel0Phase1Result::default());
         }
 
+        let mut level0_deltas = level0_deltas
+            .iter()
+            .map(|x| guard.get_from_desc(x))
+            .collect::<Vec<_>>();
+
         // Gather the files to compact in this iteration.
         //
         // Start with the oldest Level 0 delta file, and collect any other
@@ -1407,10 +1411,9 @@ impl Timeline {
         // Find the top of the historical layers
         let end_lsn = {
             let guard = self.layers.read().await;
-            let layers = guard.layer_map();
+            let layers = guard.layer_map()?;
 
-            let l0_deltas = layers.get_level0_deltas();
-            drop(guard);
+            let l0_deltas = layers.level0_deltas();
 
             // As an optimization, if we find that there are too few L0 layers,
             // bail out early. We know that the compaction algorithm would do
@@ -1782,7 +1785,7 @@ impl Timeline {
         // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
         let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
             let guard = self.layers.read().await;
-            let layers = guard.layer_map();
+            let layers = guard.layer_map()?;
             let gc_info = self.gc_info.read().unwrap();
             let mut retain_lsns_below_horizon = Vec::new();
             let gc_cutoff = gc_info.cutoffs.select_min();
@@ -2216,7 +2219,9 @@ impl Timeline {
         // Step 3: Place back to the layer map.
         {
             let mut guard = self.layers.write().await;
-            guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
+            guard
+                .open_mut()?
+                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
         };
         self.remote_client
             .schedule_compaction_update(&layer_selection, &compact_to)?;
@@ -2296,7 +2301,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
         self.flush_updates().await?;
 
         let guard = self.timeline.layers.read().await;
-        let layer_map = guard.layer_map();
+        let layer_map = guard.layer_map()?;
 
         let result = layer_map
             .iter_historic_layers()
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 645b5ad2bf..7f63b53e86 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -74,6 +74,11 @@ impl From<crate::tenant::upload_queue::NotInitialized> for Error {
         Error::ShuttingDown
     }
 }
+impl From<super::layer_manager::Shutdown> for Error {
+    fn from(_: super::layer_manager::Shutdown) -> Self {
+        Error::ShuttingDown
+    }
+}
 
 impl From<FlushLayerError> for Error {
     fn from(value: FlushLayerError) -> Self {
@@ -277,7 +282,7 @@ pub(super) async fn prepare(
 
         // between retries, these can change if compaction or gc ran in between. this will mean
         // we have to redo work.
-        partition_work(ancestor_lsn, &layers)
+        partition_work(ancestor_lsn, &layers)?
     };
 
     // TODO: layers are already sorted by something: use that to determine how much of remote
@@ -383,14 +388,14 @@ pub(super) async fn prepare(
 
 fn partition_work(
     ancestor_lsn: Lsn,
-    source_layermap: &LayerManager,
-) -> (usize, Vec<Layer>, Vec<Layer>) {
+    source: &LayerManager,
+) -> Result<(usize, Vec<Layer>, Vec<Layer>), Error> {
     let mut straddling_branchpoint = vec![];
     let mut rest_of_historic = vec![];
 
     let mut later_by_lsn = 0;
 
-    for desc in source_layermap.layer_map().iter_historic_layers() {
+    for desc in source.layer_map()?.iter_historic_layers() {
         // off by one chances here:
         // - start is inclusive
         // - end is exclusive
@@ -409,10 +414,10 @@ fn partition_work(
             &mut rest_of_historic
         };
 
-        target.push(source_layermap.get_from_desc(&desc));
+        target.push(source.get_from_desc(&desc));
     }
 
-    (later_by_lsn, straddling_branchpoint, rest_of_historic)
+    Ok((later_by_lsn, straddling_branchpoint, rest_of_historic))
 }
 
 async fn upload_rewritten_layer(
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 1ba1bf9de5..07d860eb80 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -213,51 +213,45 @@ impl Timeline {
         let mut js = tokio::task::JoinSet::new();
         {
             let guard = self.layers.read().await;
-            let layers = guard.layer_map();
-            for layer in layers.iter_historic_layers() {
-                let layer = guard.get_from_desc(&layer);
 
-                // guard against eviction while we inspect it; it might be that eviction_task and
-                // disk_usage_eviction_task both select the same layers to be evicted, and
-                // seemingly free up double the space. both succeeding is of no consequence.
+            guard
+                .likely_resident_layers()
+                .filter(|layer| {
+                    let last_activity_ts = layer.latest_activity();
 
-                if !layer.is_likely_resident() {
-                    continue;
-                }
+                    let no_activity_for = match now.duration_since(last_activity_ts) {
+                        Ok(d) => d,
+                        Err(_e) => {
+                            // We reach here if `now` < `last_activity_ts`, which can legitimately
+                            // happen if there is an access between us getting `now`, and us getting
+                            // the access stats from the layer.
+                            //
+                            // The other reason why it can happen is system clock skew because
+                            // SystemTime::now() is not monotonic, so, even if there is no access
+                            // to the layer after we get `now` at the beginning of this function,
+                            // it could be that `now`  < `last_activity_ts`.
+                            //
+                            // To distinguish the cases, we would need to record `Instant`s in the
+                            // access stats (i.e., monotonic timestamps), but then, the timestamps
+                            // values in the access stats would need to be `Instant`'s, and hence
+                            // they would be meaningless outside of the pageserver process.
+                            // At the time of writing, the trade-off is that access stats are more
+                            // valuable than detecting clock skew.
+                            return false;
+                        }
+                    };
 
-                let last_activity_ts = layer.latest_activity();
-
-                let no_activity_for = match now.duration_since(last_activity_ts) {
-                    Ok(d) => d,
-                    Err(_e) => {
-                        // We reach here if `now` < `last_activity_ts`, which can legitimately
-                        // happen if there is an access between us getting `now`, and us getting
-                        // the access stats from the layer.
-                        //
-                        // The other reason why it can happen is system clock skew because
-                        // SystemTime::now() is not monotonic, so, even if there is no access
-                        // to the layer after we get `now` at the beginning of this function,
-                        // it could be that `now`  < `last_activity_ts`.
-                        //
-                        // To distinguish the cases, we would need to record `Instant`s in the
-                        // access stats (i.e., monotonic timestamps), but then, the timestamps
-                        // values in the access stats would need to be `Instant`'s, and hence
-                        // they would be meaningless outside of the pageserver process.
-                        // At the time of writing, the trade-off is that access stats are more
-                        // valuable than detecting clock skew.
-                        continue;
-                    }
-                };
-
-                if no_activity_for > p.threshold {
+                    no_activity_for > p.threshold
+                })
+                .cloned()
+                .for_each(|layer| {
                     js.spawn(async move {
                         layer
                             .evict_and_wait(std::time::Duration::from_secs(5))
                             .await
                     });
                     stats.candidates += 1;
-                }
-            }
+                });
         };
 
         let join_all = async move {
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index e6e7bc2e77..8f20d84401 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,4 +1,4 @@
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{bail, ensure, Context};
 use itertools::Itertools;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
@@ -24,39 +24,142 @@ use crate::{
 use super::TimelineWriterState;
 
 /// Provides semantic APIs to manipulate the layer map.
-#[derive(Default)]
-pub(crate) struct LayerManager {
-    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager<Layer>,
+pub(crate) enum LayerManager {
+    /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
+    /// the layers.
+    Open(OpenLayerManager),
+    /// Shutdown layer manager where there are no more in-memory layers and persistent layers are
+    /// read-only.
+    Closed {
+        layers: HashMap<PersistentLayerKey, Layer>,
+    },
+}
+
+impl Default for LayerManager {
+    fn default() -> Self {
+        LayerManager::Open(OpenLayerManager::default())
+    }
 }
 
 impl LayerManager {
-    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
-        self.layer_fmgr.get_from_desc(desc)
+    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
+        // The assumption for the `expect()` is that all code maintains the following invariant:
+        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
+        self.layers()
+            .get(key)
+            .with_context(|| format!("get layer from key: {key}"))
+            .expect("not found")
+            .clone()
     }
 
-    pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
-        self.layer_fmgr.get_from_key(desc)
+    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
+        self.get_from_key(&desc.key())
     }
 
     /// Get an immutable reference to the layer map.
     ///
     /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
     /// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
-    pub(crate) fn layer_map(&self) -> &LayerMap {
-        &self.layer_map
+    pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
+        use LayerManager::*;
+        match self {
+            Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
+            Closed { .. } => Err(Shutdown),
+        }
     }
 
+    pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
+        use LayerManager::*;
+
+        match self {
+            Open(open) => Ok(open),
+            Closed { .. } => Err(Shutdown),
+        }
+    }
+
+    /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
+    /// order to allow shutdown to complete.
+    ///
+    /// If there was a want to flush in-memory layers, it must have happened earlier.
+    pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
+        use LayerManager::*;
+        match self {
+            Open(OpenLayerManager {
+                layer_map,
+                layer_fmgr: LayerFileManager(hashmap),
+            }) => {
+                let open = layer_map.open_layer.take();
+                let frozen = layer_map.frozen_layers.len();
+                let taken_writer_state = writer_state.take();
+                tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
+                let layers = std::mem::take(hashmap);
+                *self = Closed { layers };
+                assert_eq!(open.is_some(), taken_writer_state.is_some());
+            }
+            Closed { .. } => {
+                tracing::debug!("ignoring multiple shutdowns on layer manager")
+            }
+        }
+    }
+
+    /// Sum up the historic layer sizes
+    pub(crate) fn layer_size_sum(&self) -> u64 {
+        self.layers()
+            .values()
+            .map(|l| l.layer_desc().file_size)
+            .sum()
+    }
+
+    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &'_ Layer> + '_ {
+        self.layers().values().filter(|l| l.is_likely_resident())
+    }
+
+    pub(crate) fn contains(&self, layer: &Layer) -> bool {
+        self.contains_key(&layer.layer_desc().key())
+    }
+
+    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.layers().contains_key(key)
+    }
+
+    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
+        self.layers().keys().cloned().collect_vec()
+    }
+
+    fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
+        use LayerManager::*;
+        match self {
+            Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
+            Closed { layers } => layers,
+        }
+    }
+}
+
+#[derive(Default)]
+pub(crate) struct OpenLayerManager {
+    layer_map: LayerMap,
+    layer_fmgr: LayerFileManager<Layer>,
+}
+
+impl std::fmt::Debug for OpenLayerManager {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("OpenLayerManager")
+            .field("layer_count", &self.layer_fmgr.0.len())
+            .finish()
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+#[error("layer manager has been shutdown")]
+pub(crate) struct Shutdown;
+
+impl OpenLayerManager {
     /// Called from `load_layer_map`. Initialize the layer manager with:
     /// 1. all on-disk layers
     /// 2. next open layer (with disk disk_consistent_lsn LSN)
-    pub(crate) fn initialize_local_layers(
-        &mut self,
-        on_disk_layers: Vec<Layer>,
-        next_open_layer_at: Lsn,
-    ) {
+    pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
         let mut updates = self.layer_map.batch_update();
-        for layer in on_disk_layers {
+        for layer in layers {
             Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
         }
         updates.flush();
@@ -68,26 +171,19 @@ impl LayerManager {
         self.layer_map.next_open_layer_at = Some(next_open_layer_at);
     }
 
-    /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
-    /// called within `get_layer_for_write`.
+    /// Open a new writable layer to append data if there is no open layer, otherwise return the
+    /// current open layer, called within `get_layer_for_write`.
     pub(crate) async fn get_layer_for_write(
         &mut self,
         lsn: Lsn,
-        last_record_lsn: Lsn,
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
+        gate_guard: utils::sync::gate::GateGuard,
         ctx: &RequestContext,
-    ) -> Result<Arc<InMemoryLayer>> {
+    ) -> anyhow::Result<Arc<InMemoryLayer>> {
         ensure!(lsn.is_aligned());
 
-        ensure!(
-            lsn > last_record_lsn,
-            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
-            lsn,
-            last_record_lsn,
-        );
-
         // Do we have a layer open for writing already?
         let layer = if let Some(open_layer) = &self.layer_map.open_layer {
             if open_layer.get_lsn_range().start > lsn {
@@ -113,8 +209,15 @@ impl LayerManager {
                 lsn
             );
 
-            let new_layer =
-                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
+            let new_layer = InMemoryLayer::create(
+                conf,
+                timeline_id,
+                tenant_shard_id,
+                start_lsn,
+                gate_guard,
+                ctx,
+            )
+            .await?;
             let layer = Arc::new(new_layer);
 
             self.layer_map.open_layer = Some(layer.clone());
@@ -168,7 +271,7 @@ impl LayerManager {
         froze
     }
 
-    /// Add image layers to the layer map, called from `create_image_layers`.
+    /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`].
     pub(crate) fn track_new_image_layers(
         &mut self,
         image_layers: &[ResidentLayer],
@@ -241,7 +344,7 @@ impl LayerManager {
         self.finish_compact_l0(compact_from, compact_to, metrics)
     }
 
-    /// Called when compaction is completed.
+    /// Called post-compaction when some previous generation image layers were trimmed.
     pub(crate) fn rewrite_layers(
         &mut self,
         rewrite_layers: &[(Layer, ResidentLayer)],
@@ -330,31 +433,6 @@ impl LayerManager {
         mapping.remove(layer);
         layer.delete_on_drop();
     }
-
-    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
-        // for small layer maps, we most likely have all resident, but for larger more are likely
-        // to be evicted assuming lots of layers correlated with longer lifespan.
-
-        self.layer_map().iter_historic_layers().filter_map(|desc| {
-            self.layer_fmgr
-                .0
-                .get(&desc.key())
-                .filter(|l| l.is_likely_resident())
-                .cloned()
-        })
-    }
-
-    pub(crate) fn contains(&self, layer: &Layer) -> bool {
-        self.layer_fmgr.contains(layer)
-    }
-
-    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
-        self.layer_fmgr.contains_key(key)
-    }
-
-    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
-        self.layer_fmgr.0.keys().cloned().collect_vec()
-    }
 }
 
 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
@@ -366,24 +444,6 @@ impl<T> Default for LayerFileManager<T> {
 }
 
 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
-    fn get_from_key(&self, key: &PersistentLayerKey) -> T {
-        // The assumption for the `expect()` is that all code maintains the following invariant:
-        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
-        self.0
-            .get(key)
-            .with_context(|| format!("get layer from key: {}", key))
-            .expect("not found")
-            .clone()
-    }
-
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
-        self.get_from_key(&desc.key())
-    }
-
-    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
-        self.0.contains_key(key)
-    }
-
     pub(crate) fn insert(&mut self, layer: T) {
         let present = self.0.insert(layer.layer_desc().key(), layer.clone());
         if present.is_some() && cfg!(debug_assertions) {
@@ -391,10 +451,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
         }
     }
 
-    pub(crate) fn contains(&self, layer: &T) -> bool {
-        self.0.contains_key(&layer.layer_desc().key())
-    }
-
     pub(crate) fn remove(&mut self, layer: &T) {
         let present = self.0.remove(&layer.layer_desc().key());
         if present.is_none() && cfg!(debug_assertions) {

From 658d763915378c373bebdb86b40d905dceed2de8 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <iskyzh@gmail.com>
Date: Wed, 7 Aug 2024 23:37:46 +0800
Subject: [PATCH 27/48] fix(pageserver): dump the key when it's invalid (#8633)

We see an assertion error in staging. Dump the key to guess where it was
from, and then we can fix it.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/key.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 0acd83753e..3af3f74e9c 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -107,7 +107,10 @@ impl Key {
     /// As long as Neon does not support tablespace (because of lack of access to local file system),
     /// we can assume that only some predefined namespace OIDs are used which can fit in u16
     pub fn to_i128(&self) -> i128 {
-        assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        assert!(
+            self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
+            "invalid key: {self}",
+        );
         (((self.field1 & 0x7F) as i128) << 120)
             | (((self.field2 & 0xFFFF) as i128) << 104)
             | ((self.field3 as i128) << 72)

From 40e3c913bb5588a2a3a9c8a76ab43fd7aa418140 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 7 Aug 2024 19:19:00 +0300
Subject: [PATCH 28/48] refactor(timeline_detach_ancestor): replace ordered
 reparented with a hashset (#8629)

Earlier I was thinking we'd need a (ancestor_lsn, timeline_id) ordered
list of reparented. Turns out we did not need it at all. Replace it with
an unordered hashset. Additionally refactor the reparented direct
children query out, it will later be used from more places.

Split off from #8430.

Cc: #6994
---
 .../src/models/detach_ancestor.rs             |   4 +-
 pageserver/src/tenant/mgr.rs                  |   5 +-
 pageserver/src/tenant/timeline.rs             |   6 +-
 .../src/tenant/timeline/detach_ancestor.rs    | 121 ++++++++++--------
 storage_controller/src/service.rs             |   2 +-
 test_runner/fixtures/pageserver/http.py       |   4 +-
 .../regress/test_timeline_detach_ancestor.py  |   6 +-
 7 files changed, 78 insertions(+), 70 deletions(-)

diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs
index ae5a21bab9..ad74d343ae 100644
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,6 +1,8 @@
+use std::collections::HashSet;
+
 use utils::id::TimelineId;
 
 #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct AncestorDetached {
-    pub reparented_timelines: Vec<TimelineId>,
+    pub reparented_timelines: HashSet<TimelineId>,
 }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 7901fc3554..3f592f167e 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, HashMap, HashSet};
 use std::ops::Deref;
 use std::sync::Arc;
 use std::time::Duration;
@@ -1966,7 +1966,8 @@ impl TenantManager {
         timeline_id: TimelineId,
         prepared: PreparedTimelineDetach,
         ctx: &RequestContext,
-    ) -> Result<Vec<TimelineId>, anyhow::Error> {
+    ) -> Result<HashSet<TimelineId>, anyhow::Error> {
+        // FIXME: this is unnecessary, slotguard already has these semantics
         struct RevertOnDropSlot(Option<SlotGuard>);
 
         impl Drop for RevertOnDropSlot {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 8f9ff78fd8..76dcb5645f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3286,10 +3286,6 @@ impl Timeline {
         Ok(ancestor.clone())
     }
 
-    pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
-        self.ancestor_timeline.clone()
-    }
-
     pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
         &self.shard_identity
     }
@@ -4366,7 +4362,7 @@ impl Timeline {
         tenant: &crate::tenant::Tenant,
         prepared: detach_ancestor::PreparedTimelineDetach,
         ctx: &RequestContext,
-    ) -> Result<Vec<TimelineId>, anyhow::Error> {
+    ) -> Result<HashSet<TimelineId>, anyhow::Error> {
         detach_ancestor::complete(self, tenant, prepared, ctx).await
     }
 
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 7f63b53e86..3b52adc77b 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -1,4 +1,4 @@
-use std::sync::Arc;
+use std::{collections::HashSet, sync::Arc};
 
 use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
 use crate::{
@@ -146,50 +146,9 @@ pub(super) async fn prepare(
             }
         }
 
-        // detached has previously been detached; let's inspect each of the current timelines and
-        // report back the timelines which have been reparented by our detach
-        let mut all_direct_children = tenant
-            .timelines
-            .lock()
-            .unwrap()
-            .values()
-            .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
-            .map(|tl| (tl.ancestor_lsn, tl.clone()))
-            .collect::<Vec<_>>();
-
-        let mut any_shutdown = false;
-
-        all_direct_children.retain(
-            |(_, tl)| match tl.remote_client.initialized_upload_queue() {
-                Ok(accessor) => accessor
-                    .latest_uploaded_index_part()
-                    .lineage
-                    .is_reparented(),
-                Err(_shutdownalike) => {
-                    // not 100% a shutdown, but let's bail early not to give inconsistent results in
-                    // sharded enviroment.
-                    any_shutdown = true;
-                    true
-                }
-            },
-        );
-
-        if any_shutdown {
-            // it could be one or many being deleted; have client retry
-            return Err(Error::ShuttingDown);
-        }
-
-        let mut reparented = all_direct_children;
-        // why this instead of hashset? there is a reason, but I've forgotten it many times.
-        //
-        // maybe if this was a hashset we would not be able to distinguish some race condition.
-        reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
-
+        let reparented_timelines = reparented_direct_children(detached, tenant)?;
         return Ok(Progress::Done(AncestorDetached {
-            reparented_timelines: reparented
-                .into_iter()
-                .map(|(_, tl)| tl.timeline_id)
-                .collect(),
+            reparented_timelines,
         }));
     };
 
@@ -386,6 +345,57 @@ pub(super) async fn prepare(
     Ok(Progress::Prepared(guard, prepared))
 }
 
+fn reparented_direct_children(
+    detached: &Arc<Timeline>,
+    tenant: &Tenant,
+) -> Result<HashSet<TimelineId>, Error> {
+    let mut all_direct_children = tenant
+        .timelines
+        .lock()
+        .unwrap()
+        .values()
+        .filter_map(|tl| {
+            let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached));
+
+            if is_direct_child {
+                Some(tl.clone())
+            } else {
+                if let Some(timeline) = tl.ancestor_timeline.as_ref() {
+                    assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live");
+                }
+                None
+            }
+        })
+        // Collect to avoid lock taking order problem with Tenant::timelines and
+        // Timeline::remote_client
+        .collect::<Vec<_>>();
+
+    let mut any_shutdown = false;
+
+    all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() {
+        Ok(accessor) => accessor
+            .latest_uploaded_index_part()
+            .lineage
+            .is_reparented(),
+        Err(_shutdownalike) => {
+            // not 100% a shutdown, but let's bail early not to give inconsistent results in
+            // sharded enviroment.
+            any_shutdown = true;
+            true
+        }
+    });
+
+    if any_shutdown {
+        // it could be one or many being deleted; have client retry
+        return Err(Error::ShuttingDown);
+    }
+
+    Ok(all_direct_children
+        .into_iter()
+        .map(|tl| tl.timeline_id)
+        .collect())
+}
+
 fn partition_work(
     ancestor_lsn: Lsn,
     source: &LayerManager,
@@ -544,11 +554,12 @@ pub(super) async fn complete(
     tenant: &Tenant,
     prepared: PreparedTimelineDetach,
     _ctx: &RequestContext,
-) -> Result<Vec<TimelineId>, anyhow::Error> {
+) -> Result<HashSet<TimelineId>, anyhow::Error> {
     let PreparedTimelineDetach { layers } = prepared;
 
     let ancestor = detached
-        .get_ancestor_timeline()
+        .ancestor_timeline
+        .as_ref()
         .expect("must still have a ancestor");
     let ancestor_lsn = detached.get_ancestor_lsn();
 
@@ -588,7 +599,7 @@ pub(super) async fn complete(
             }
 
             let tl_ancestor = tl.ancestor_timeline.as_ref()?;
-            let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
+            let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
             let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
 
             let is_deleting = tl
@@ -629,13 +640,18 @@ pub(super) async fn complete(
         });
 
     let reparenting_candidates = tasks.len();
-    let mut reparented = Vec::with_capacity(tasks.len());
+    let mut reparented = HashSet::with_capacity(tasks.len());
 
     while let Some(res) = tasks.join_next().await {
         match res {
             Ok(Some(timeline)) => {
                 tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
-                reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
+
+                assert!(
+                    reparented.insert(timeline.timeline_id),
+                    "duplicate reparenting? timeline_id={}",
+                    timeline.timeline_id
+                );
             }
             Ok(None) => {
                 // lets just ignore this for now. one or all reparented timelines could had
@@ -657,12 +673,5 @@ pub(super) async fn complete(
         tracing::info!("failed to reparent some candidates");
     }
 
-    reparented.sort_unstable();
-
-    let reparented = reparented
-        .into_iter()
-        .map(|(_, timeline_id)| timeline_id)
-        .collect();
-
     Ok(reparented)
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 6940bf2c64..e391ce65e6 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2954,7 +2954,6 @@ impl Service {
         }
 
         // no shard needs to go first/last; the operation should be idempotent
-        // TODO: it would be great to ensure that all shards return the same error
         let mut results = self
             .tenant_for_shards(targets, |tenant_shard_id, node| {
                 futures::FutureExt::boxed(detach_one(
@@ -2973,6 +2972,7 @@ impl Service {
             .filter(|(_, res)| res != &any.1)
             .collect::<Vec<_>>();
         if !mismatching.is_empty() {
+            // this can be hit by races which should not happen because operation lock on cplane
             let matching = results.len() - mismatching.len();
             tracing::error!(
                 matching,
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 5be59d3749..65d6ff5d62 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -857,7 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         timeline_id: TimelineId,
         batch_size: int | None = None,
         **kwargs,
-    ) -> List[TimelineId]:
+    ) -> Set[TimelineId]:
         params = {}
         if batch_size is not None:
             params["batch_size"] = batch_size
@@ -868,7 +868,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         )
         self.verbose_error(res)
         json = res.json()
-        return list(map(TimelineId, json["reparented_timelines"]))
+        return set(map(TimelineId, json["reparented_timelines"]))
 
     def evict_layer(
         self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 38f8dfa885..b3767a2766 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -165,7 +165,7 @@ def test_ancestor_detach_branched_from(
     )
 
     all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
-    assert all_reparented == []
+    assert all_reparented == set()
 
     if restart_after:
         env.pageserver.stop()
@@ -534,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history(
 
     for _, timeline_id in skip_main:
         reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
-        assert reparented == [], "we have no earlier branches at any level"
+        assert reparented == set(), "we have no earlier branches at any level"
 
     post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))
     assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total"
@@ -774,7 +774,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
         else:
             break
 
-    assert reparented == [], "too many retries (None) or unexpected reparentings"
+    assert reparented == set(), "too many retries (None) or unexpected reparentings"
 
     for shard_info in shards:
         node_id = int(shard_info["node_id"])

From e082226a32b91e53a3227ce66bf9b66d30654bd0 Mon Sep 17 00:00:00 2001
From: Cihan Demirci <128653800+fcdm@users.noreply.github.com>
Date: Wed, 7 Aug 2024 19:53:47 +0300
Subject: [PATCH 29/48] cicd: push build-tools image to ACR as well (#8638)

https://github.com/neondatabase/cloud/issues/15899
---
 .github/workflows/pin-build-tools-image.yml | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml
index 024594532f..cf10910b0b 100644
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -66,8 +66,22 @@ jobs:
           username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
           password: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
-      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
+      - name: Azure login
+        if: steps.check-manifests.outputs.skip == 'false'
+        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
+        with:
+          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+
+      - name: Login to ACR
+        if: steps.check-manifests.outputs.skip == 'false'
+        run: |
+          az acr login --name=neoneastus2
+
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR
         if: steps.check-manifests.outputs.skip == 'false'
         run: |
           docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
+                                          -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
                                              neondatabase/build-tools:${FROM_TAG}

From 77bb6c4cc459268d6b95b4c543a6638b873a6af4 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Wed, 7 Aug 2024 16:04:19 -0400
Subject: [PATCH 30/48] feat(pageserver): add direct io pageserver config
 (#8622)

Part of #8130, [RFC: Direct IO For Pageserver](https://github.com/neondatabase/neon/blob/problame/direct-io-rfc/docs/rfcs/034-direct-io-for-pageserver.md)

## Description

Add pageserver config for evaluating/enabling direct I/O.

- Disabled: current default, uses buffered io as is.
- Evaluate: still uses buffered io, but could do alignment checking and
perf simulation (pad latency by direct io RW to a fake file).
- Enabled: uses direct io, behavior on alignment error is configurable.


Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 libs/pageserver_api/src/models.rs | 49 +++++++++++++++++++++++++++++++
 pageserver/src/bin/pageserver.rs  |  1 +
 pageserver/src/config.rs          | 17 +++++++++++
 pageserver/src/virtual_file.rs    |  1 +
 4 files changed, 68 insertions(+)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index b541bba6a1..ab4adfbebe 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -947,6 +947,8 @@ pub struct TopTenantShardsResponse {
 }
 
 pub mod virtual_file {
+    use std::path::PathBuf;
+
     #[derive(
         Copy,
         Clone,
@@ -965,6 +967,53 @@ pub mod virtual_file {
         #[cfg(target_os = "linux")]
         TokioEpollUring,
     }
+
+    /// Direct IO modes for a pageserver.
+    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
+    #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+    pub enum DirectIoMode {
+        /// Direct IO disabled (uses usual buffered IO).
+        #[default]
+        Disabled,
+        /// Direct IO disabled (performs checks and perf simulations).
+        Evaluate {
+            /// Alignment check level
+            alignment_check: DirectIoAlignmentCheckLevel,
+            /// Latency padded for performance simulation.
+            latency_padding: DirectIoLatencyPadding,
+        },
+        /// Direct IO enabled.
+        Enabled {
+            /// Actions to perform on alignment error.
+            on_alignment_error: DirectIoOnAlignmentErrorAction,
+        },
+    }
+
+    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
+    #[serde(rename_all = "kebab-case")]
+    pub enum DirectIoAlignmentCheckLevel {
+        #[default]
+        Error,
+        Log,
+        None,
+    }
+
+    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
+    #[serde(rename_all = "kebab-case")]
+    pub enum DirectIoOnAlignmentErrorAction {
+        Error,
+        #[default]
+        FallbackToBuffered,
+    }
+
+    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
+    #[serde(tag = "type", rename_all = "kebab-case")]
+    pub enum DirectIoLatencyPadding {
+        /// Pad virtual file operations with IO to a fake file.
+        FakeFileRW { path: PathBuf },
+        #[default]
+        None,
+    }
 }
 
 // Wrapped in libpq CopyData
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 5ebd6511ac..932918410c 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -123,6 +123,7 @@ fn main() -> anyhow::Result<()> {
 
     // after setting up logging, log the effective IO engine choice and read path implementations
     info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
+    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
     info!(?conf.get_impl, "starting with get page implementation");
     info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
     info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 41c2fe0af3..f4c367bd4d 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -300,6 +300,9 @@ pub struct PageServerConf {
     /// This flag is temporary and will be removed after gradual rollout.
     /// See <https://github.com/neondatabase/neon/issues/8184>.
     pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
+
+    /// Direct IO settings
+    pub virtual_file_direct_io: virtual_file::DirectIoMode,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -408,6 +411,8 @@ struct PageServerConfigBuilder {
     l0_flush: BuilderValue<L0FlushConfig>,
 
     compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
+
+    virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
 }
 
 impl PageServerConfigBuilder {
@@ -498,6 +503,7 @@ impl PageServerConfigBuilder {
             ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: Set(L0FlushConfig::default()),
             compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
+            virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
         }
     }
 }
@@ -685,6 +691,10 @@ impl PageServerConfigBuilder {
         self.compact_level0_phase1_value_access = BuilderValue::Set(value);
     }
 
+    pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
+        self.virtual_file_direct_io = BuilderValue::Set(value);
+    }
+
     pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
         let default = Self::default_values();
 
@@ -743,6 +753,7 @@ impl PageServerConfigBuilder {
                 ephemeral_bytes_per_memory_kb,
                 l0_flush,
                 compact_level0_phase1_value_access,
+                virtual_file_direct_io,
             }
             CUSTOM LOGIC
             {
@@ -1018,6 +1029,9 @@ impl PageServerConf {
                 "compact_level0_phase1_value_access" => {
                     builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
                 }
+                "virtual_file_direct_io" => {
+                    builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
+                }
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -1103,6 +1117,7 @@ impl PageServerConf {
             ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
             l0_flush: L0FlushConfig::default(),
             compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
+            virtual_file_direct_io: virtual_file::DirectIoMode::default(),
         }
     }
 }
@@ -1345,6 +1360,7 @@ background_task_maximum_delay = '334 s'
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                 l0_flush: L0FlushConfig::default(),
                 compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
+                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1420,6 +1436,7 @@ background_task_maximum_delay = '334 s'
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                 l0_flush: L0FlushConfig::default(),
                 compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
+                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 51b0c420c3..8de646469e 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -34,6 +34,7 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
 use self::owned_buffers_io::write::OwnedAsyncWriter;
+pub(crate) use api::DirectIoMode;
 pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;

From acaacd468097cb83cdcbd7f74284ada964d08bc0 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 7 Aug 2024 21:17:08 +0100
Subject: [PATCH 31/48] pageserver: make bench_ingest build (but panic) on
 macOS (#8641)

## Problem

Some developers build on MacOS, which doesn't have  io_uring.

## Summary of changes

- Add `io_engine_for_bench`, which on linux will give io_uring or panic
if it's unavailable, and on MacOS will always panic.

We do not want to run such benchmarks with StdFs: the results aren't
interesting, and will actively waste the time of any developers who
start investigating performance before they realize they're using a
known-slow I/O backend.

Why not just conditionally compile this benchmark on linux only? Because
even on linux, I still want it to refuse to run if it can't get
io_uring.
---
 pageserver/benches/bench_ingest.rs       |  4 ++--
 pageserver/src/virtual_file.rs           |  1 +
 pageserver/src/virtual_file/io_engine.rs | 26 ++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 459394449a..9bab02e46c 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -11,7 +11,7 @@ use pageserver::{
     repository::Value,
     task_mgr::TaskKind,
     tenant::storage_layer::InMemoryLayer,
-    virtual_file::{self, api::IoEngineKind},
+    virtual_file,
 };
 use pageserver_api::{key::Key, shard::TenantShardId};
 use utils::{
@@ -149,7 +149,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     let conf: &'static PageServerConf = Box::leak(Box::new(
         pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
     ));
-    virtual_file::init(16384, IoEngineKind::TokioEpollUring);
+    virtual_file::init(16384, virtual_file::io_engine_for_bench());
     page_cache::init(conf.page_cache_size);
 
     {
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 8de646469e..27f6fe90a4 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -30,6 +30,7 @@ use tokio::time::Instant;
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
 pub use io_engine::feature_test as io_engine_feature_test;
+pub use io_engine::io_engine_for_bench;
 pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 2820cea097..0ffcd9fa05 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -328,3 +328,29 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
     .join()
     .unwrap()
 }
+
+/// For use in benchmark binaries only.
+///
+/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
+/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
+/// developer time trying to figure out why it's slow.
+///
+/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
+pub fn io_engine_for_bench() -> IoEngineKind {
+    #[cfg(not(target_os = "linux"))]
+    {
+        panic!("This benchmark does I/O and can only give a representative result on Linux");
+    }
+    #[cfg(target_os = "linux")]
+    {
+        match feature_test().unwrap() {
+            FeatureTestResult::PlatformPreferred(engine) => engine,
+            FeatureTestResult::Worse {
+                engine: _engine,
+                remark,
+            } => {
+                panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}");
+            }
+        }
+    }
+}

From 2a210d4c58e57e1d22a28be4433907adfc944698 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 8 Aug 2024 10:23:57 +0300
Subject: [PATCH 32/48] Use sycnhronous commit for logical replicaiton worker
 (#8645)

## Problem

See
https://neondb.slack.com/archives/C03QLRH7PPD/p1723038557449239?thread_ts=1722868375.476789&cid=C03QLRH7PPD


Logical replication subscription by default use `synchronous_commit=off`
which cause problems with safekeeper

## Summary of changes

Set `synchronous_commit=on` for logical replication subscription in
test_subscriber_restart.py

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 test_runner/regress/test_subscriber_restart.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py
index 91caad7220..4581008022 100644
--- a/test_runner/regress/test_subscriber_restart.py
+++ b/test_runner/regress/test_subscriber_restart.py
@@ -37,7 +37,9 @@ def test_subscriber_restart(neon_simple_env: NeonEnv):
             scur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
             # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica
             pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin"
-            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
+            # synchronous_commit=on to test a hypothesis for why this test has been flaky.
+            # XXX: Add link to the issue
+            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)"
             scur.execute(query)
             time.sleep(2)  # let initial table sync complete
 

From 48ae1214c5385f6feb645e3f10598fdfc9a33191 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 8 Aug 2024 12:34:47 +0300
Subject: [PATCH 33/48] fix(test): do not fail test for filesystem race (#8643)

evidence:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8632/10287641784/index.html#suites/0e58fb04d9998963e98e45fe1880af7d/c7a46335515142b/
---
 pageserver/src/statvfs.rs | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs
index 45a516566f..ede1791afa 100644
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -56,7 +56,6 @@ impl Statvfs {
 }
 
 pub mod mock {
-    use anyhow::Context;
     use camino::Utf8Path;
     use regex::Regex;
     use tracing::log::info;
@@ -135,14 +134,30 @@ pub mod mock {
             {
                 continue;
             }
-            total += entry
-                .metadata()
-                .with_context(|| format!("get metadata of {:?}", entry.path()))?
-                .len();
+            let m = match entry.metadata() {
+                Ok(m) => m,
+                Err(e) if is_not_found(&e) => {
+                    // some temp file which got removed right as we are walking
+                    continue;
+                }
+                Err(e) => {
+                    return Err(anyhow::Error::new(e)
+                        .context(format!("get metadata of {:?}", entry.path())))
+                }
+            };
+            total += m.len();
         }
         Ok(total)
     }
 
+    fn is_not_found(e: &walkdir::Error) -> bool {
+        let Some(io_error) = e.io_error() else {
+            return false;
+        };
+        let kind = io_error.kind();
+        matches!(kind, std::io::ErrorKind::NotFound)
+    }
+
     pub struct Statvfs {
         pub blocks: u64,
         pub blocks_available: u64,

From 552832b81972d8facc609f77944c16dba2d092d5 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 8 Aug 2024 14:02:53 +0300
Subject: [PATCH 34/48] fix: stop leaking BackgroundPurges (#8650)

avoid "leaking" the completions of BackgroundPurges by:

1. switching it to TaskTracker for provided close+wait
2. stop using tokio::fs::remove_dir_all which will consume two units of
memory instead of one blocking task

Additionally, use more graceful shutdown in tests which do actually some
background cleanup.
---
 pageserver/src/tenant/mgr.rs              | 96 +++++++----------------
 test_runner/regress/test_tenant_delete.py | 11 ++-
 2 files changed, 36 insertions(+), 71 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 3f592f167e..3316627540 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -224,21 +224,8 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 }
 
 /// See [`Self::spawn`].
-#[derive(Clone)]
-pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
-enum BackgroundPurgesInner {
-    Open(tokio::task::JoinSet<()>),
-    // we use the async mutex for coalescing
-    ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
-}
-
-impl Default for BackgroundPurges {
-    fn default() -> Self {
-        Self(Arc::new(std::sync::Mutex::new(
-            BackgroundPurgesInner::Open(JoinSet::new()),
-        )))
-    }
-}
+#[derive(Clone, Default)]
+pub struct BackgroundPurges(tokio_util::task::TaskTracker);
 
 impl BackgroundPurges {
     /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
@@ -247,24 +234,32 @@ impl BackgroundPurges {
     /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
     /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
     pub fn spawn(&self, tmp_path: Utf8PathBuf) {
-        let mut guard = self.0.lock().unwrap();
-        let jset = match &mut *guard {
-            BackgroundPurgesInner::Open(ref mut jset) => jset,
-            BackgroundPurgesInner::ShuttingDown(_) => {
-                warn!("trying to spawn background purge during shutdown, ignoring");
-                return;
+        // because on shutdown we close and wait, we are misusing TaskTracker a bit.
+        //
+        // so first acquire a token, then check if the tracker has been closed. the tracker might get closed
+        // right after, but at least the shutdown will wait for what we are spawning next.
+        let token = self.0.token();
+
+        if self.0.is_closed() {
+            warn!(
+                %tmp_path,
+                "trying to spawn background purge during shutdown, ignoring"
+            );
+            return;
+        }
+
+        let span = info_span!(parent: None, "background_purge", %tmp_path);
+
+        let task = move || {
+            let _token = token;
+            let _entered = span.entered();
+            if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
+                // should we fatal_io_error here?
+                warn!(%error, "failed to purge tenant directory");
             }
         };
-        jset.spawn_on(
-            async move {
-                if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
-                    // should we fatal_io_error here?
-                    warn!(%error, path=%tmp_path, "failed to purge tenant directory");
-                }
-            }
-            .instrument(info_span!(parent: None, "background_purge")),
-            BACKGROUND_RUNTIME.handle(),
-        );
+
+        BACKGROUND_RUNTIME.spawn_blocking(task);
     }
 
     /// When this future completes, all background purges have completed.
@@ -278,42 +273,9 @@ impl BackgroundPurges {
     /// instances of this future will continue to be correct.
     #[instrument(skip_all)]
     pub async fn shutdown(&self) {
-        let jset = {
-            let mut guard = self.0.lock().unwrap();
-            match &mut *guard {
-                BackgroundPurgesInner::Open(jset) => {
-                    *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
-                        std::mem::take(jset),
-                    )))
-                }
-                BackgroundPurgesInner::ShuttingDown(_) => {
-                    // calling shutdown multiple times is most likely a bug in pageserver shutdown code
-                    warn!("already shutting down");
-                }
-            };
-            match &mut *guard {
-                BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
-                BackgroundPurgesInner::Open(_) => {
-                    unreachable!("above code transitions into shut down state");
-                }
-            }
-        };
-        let mut jset = jset.lock().await; // concurrent callers coalesce here
-        while let Some(res) = jset.join_next().await {
-            match res {
-                Ok(()) => {}
-                Err(e) if e.is_panic() => {
-                    // If it panicked, the error is already logged by the panic hook.
-                }
-                Err(e) if e.is_cancelled() => {
-                    unreachable!("we don't cancel the joinset or runtime")
-                }
-                Err(e) => {
-                    // No idea when this can happen, but let's log it.
-                    warn!(%e, "background purge task failed or panicked");
-                }
-            }
-        }
+        // forbid new tasks (can be called many times)
+        self.0.close();
+        self.0.wait().await;
     }
 }
 
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index c01b3a2e89..dadf5ca672 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -128,6 +128,8 @@ def test_tenant_delete_smoke(
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
 
+    env.pageserver.stop()
+
 
 def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
     """Reproduction of 2023-11-23 stuck tenants investigation"""
@@ -200,11 +202,10 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
         if deletion is not None:
             deletion.join()
 
+    env.pageserver.stop()
 
-def test_tenant_delete_races_timeline_creation(
-    neon_env_builder: NeonEnvBuilder,
-    pg_bin: PgBin,
-):
+
+def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder):
     """
     Validate that timeline creation executed in parallel with deletion works correctly.
 
@@ -318,6 +319,8 @@ def test_tenant_delete_races_timeline_creation(
     # We deleted our only tenant, and the scrubber fails if it detects nothing
     neon_env_builder.disable_scrub_on_exit()
 
+    env.pageserver.stop()
+
 
 def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
     """

From 56077caaf92a4a776eb51a169dcd3f113276f84c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 8 Aug 2024 12:57:48 +0100
Subject: [PATCH 35/48] pageserver: remove paranoia double-calculation of
 retain_lsns (#8617)

## Problem

This code was to mitigate risk in
https://github.com/neondatabase/neon/pull/8427

As expected, we did not hit this code path - the new continuous updates
of gc_info are working fine, we can remove this code now.

## Summary of changes

- Remove block that double-checks retain_lsns
---
 pageserver/src/tenant.rs | 48 ----------------------------------------
 1 file changed, 48 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2422ab4cf2..90c0e28bc4 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3012,54 +3012,6 @@ impl Tenant {
         // because that will stall branch creation.
         let gc_cs = self.gc_cs.lock().await;
 
-        // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
-        // depend on.  So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
-        // and fail out if it's inaccurate.
-        // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
-        {
-            let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
-                BTreeMap::new();
-            timelines.iter().for_each(|timeline| {
-                if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
-                    let ancestor_children =
-                        all_branchpoints.entry(*ancestor_timeline_id).or_default();
-                    ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
-                }
-            });
-
-            for timeline in &timelines {
-                let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
-                    .remove(&timeline.timeline_id)
-                    .unwrap_or_default();
-
-                branchpoints.sort_by_key(|b| b.0);
-
-                let target = timeline.gc_info.read().unwrap();
-
-                // We require that retain_lsns contains everything in `branchpoints`, but not that
-                // they are exactly equal: timeline deletions can race with us, so retain_lsns
-                // may contain some extra stuff.  It is safe to have extra timelines in there, because it
-                // just means that we retain slightly more data than we otherwise might.
-                let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
-                for b in &branchpoints {
-                    if !have_branchpoints.contains(b) {
-                        tracing::error!(
-                            "Bug: `retain_lsns` is set incorrectly.  Expected be {:?}, but found {:?}",
-                            branchpoints,
-                            target.retain_lsns
-                        );
-                        debug_assert!(false);
-                        // Do not GC based on bad information!
-                        // (ab-use an existing GcError type rather than adding a new one, since this is a
-                        // "should never happen" check that will be removed soon).
-                        return Err(GcError::Remote(anyhow::anyhow!(
-                            "retain_lsns failed validation!"
-                        )));
-                    }
-                }
-            }
-        }
-
         // Ok, we now know all the branch points.
         // Update the GC information for each timeline.
         let mut gc_timelines = Vec::with_capacity(timelines.len());

From 857ad70b71ffaebcfdea27cae5d4298b29dd6b3f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 8 Aug 2024 19:24:21 +0100
Subject: [PATCH 36/48] tests: don't require kafka client for regular tests
 (#8662)

## Problem

We're adding more third party dependencies to support more diverse +
realistic test cases in `test_runner/logical_repl`. I :heart: these
tests, they are a good thing.

The slight glitch is that python packaging is hard, and some third party
python packages have issues. For example the current kafka dependency
doesn't work on latest python. We can mitigate that by only importing
these more specialized dependencies in the tests that use them.

## Summary of changes

- Move the `kafka` import into a test body, so that folks running the
regular `test_runner/regress` tests don't have to have a working kafka
client package.
---
 test_runner/logical_repl/test_debezium.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py
index 700b731418..5426a06ca1 100644
--- a/test_runner/logical_repl/test_debezium.py
+++ b/test_runner/logical_repl/test_debezium.py
@@ -12,7 +12,6 @@ import requests
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import RemotePostgres
 from fixtures.utils import wait_until
-from kafka import KafkaConsumer
 
 
 class DebeziumAPI:
@@ -95,6 +94,8 @@ def debezium(remote_pg: RemotePostgres):
     log.debug("%s %s %s", resp.status_code, resp.ok, resp.text)
     assert resp.status_code == 201
     assert len(dbz.list_connectors()) == 1
+    from kafka import KafkaConsumer
+
     consumer = KafkaConsumer(
         "dbserver1.inventory.customers",
         bootstrap_servers=["kafka:9092"],

From c725a3e4b146b6db6a546091a10c0571e70a5b0d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 9 Aug 2024 07:17:16 +0100
Subject: [PATCH 37/48] CI(build-tools): update Rust, Python, Mold (#8667)

## Problem
- Rust 1.80.1 has been released:
https://blog.rust-lang.org/2024/08/08/Rust-1.80.1.html
- Python 3.9.19 has been released:
https://www.python.org/downloads/release/python-3919/
- Mold 2.33.0 has been released:
https://github.com/rui314/mold/releases/tag/v2.33.0
- Unpinned `cargo-deny` in `build-tools` got updated to the latest
version and doesn't work anymore with the current config file

## Summary of changes
- Bump Rust to 1.80.1
- Bump Python to 3.9.19
- Bump Mold to 2.33.0
- Pin `cargo-deny`, `cargo-hack`, `cargo-hakari`, `cargo-nextest`,
`rustfilt` versions
- Update `deny.toml` to the latest format, see
https://github.com/EmbarkStudios/cargo-deny/pull/611
---
 Dockerfile.build-tools | 21 +++++++++++++--------
 deny.toml              | 10 ++--------
 rust-toolchain.toml    |  2 +-
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index dfaab1cb2e..a72092e8e2 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
     && rm awscliv2.zip
 
 # Mold: A Modern Linker
-ENV MOLD_VERSION v2.31.0
+ENV MOLD_VERSION=v2.33.0
 RUN set -e \
     && git clone https://github.com/rui314/mold.git \
     && mkdir mold/build \
@@ -168,7 +168,7 @@ USER nonroot:nonroot
 WORKDIR /home/nonroot
 
 # Python
-ENV PYTHON_VERSION=3.9.18 \
+ENV PYTHON_VERSION=3.9.19 \
     PYENV_ROOT=/home/nonroot/.pyenv \
     PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
 RUN set -e \
@@ -192,9 +192,14 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.80.0
+ENV RUSTC_VERSION=1.80.1
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
+ARG RUSTFILT_VERSION=0.2.1
+ARG CARGO_HAKARI_VERSION=0.9.30
+ARG CARGO_DENY_VERSION=0.16.1
+ARG CARGO_HACK_VERSION=0.6.31
+ARG CARGO_NEXTEST_VERSION=0.9.72
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -204,11 +209,11 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     cargo --version && rustup --version && \
     rustup component add llvm-tools-preview rustfmt clippy && \
     cargo install --git https://github.com/paritytech/cachepot && \
-    cargo install rustfilt && \
-    cargo install cargo-hakari && \
-    cargo install cargo-deny --locked && \
-    cargo install cargo-hack && \
-    cargo install cargo-nextest && \
+    cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
+    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
+    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
+    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
+    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
     rm -rf /home/nonroot/.cargo/registry && \
     rm -rf /home/nonroot/.cargo/git
 ENV RUSTC_WRAPPER=cachepot
diff --git a/deny.toml b/deny.toml
index 469609c496..dc985138e6 100644
--- a/deny.toml
+++ b/deny.toml
@@ -4,6 +4,7 @@
 # to your expectations and requirements.
 
 # Root options
+[graph]
 targets = [
     { triple = "x86_64-unknown-linux-gnu" },
     { triple = "aarch64-unknown-linux-gnu" },
@@ -12,6 +13,7 @@ targets = [
 ]
 all-features = false
 no-default-features = false
+[output]
 feature-depth = 1
 
 # This section is considered when running `cargo deny check advisories`
@@ -19,17 +21,13 @@ feature-depth = 1
 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
 [advisories]
 db-urls = ["https://github.com/rustsec/advisory-db"]
-vulnerability = "deny"
-unmaintained = "warn"
 yanked = "warn"
-notice = "warn"
 ignore = []
 
 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
 [licenses]
-unlicensed = "deny"
 allow = [
     "Apache-2.0",
     "Artistic-2.0",
@@ -42,10 +40,6 @@ allow = [
     "OpenSSL",
     "Unicode-DFS-2016",
 ]
-deny = []
-copyleft = "warn"
-allow-osi-fsf-free = "neither"
-default = "deny"
 confidence-threshold = 0.8
 exceptions = [
     # Zlib license has some restrictions if we decide to change sth
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 3510359591..368b8d300a 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.80.0"
+channel = "1.80.1"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html

From af8c86590322b2bd906ef80ec83b24da931203ab Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 9 Aug 2024 07:54:54 +0100
Subject: [PATCH 38/48] Dockerfiles: fix LegacyKeyValueFormat &
 JSONArgsRecommended  (#8664)

## Problem
CI complains in all PRs:
```
"ENV key=value" should be used instead of legacy "ENV key value" format
```
https://docs.docker.com/reference/build-checks/legacy-key-value-format/

See
- https://github.com/neondatabase/neon/pull/8644/files ("Unchanged files
with check annotations" section)
- https://github.com/neondatabase/neon/actions/runs/10304090562?pr=8644
("Annotations" section)


## Summary of changes
- Use `ENV key=value` instead of `ENV key value` in all Dockerfiles
---
 Dockerfile              |  6 +++---
 Dockerfile.build-tools  |  2 +-
 Dockerfile.compute-node | 28 ++++++++++++++--------------
 vm-image-spec.yaml      |  4 ++--
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ace112cccf..6ed57a84a3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
 
-ENV BUILD_TYPE release
+ENV BUILD_TYPE=release
 RUN set -e \
     && mold -run make -j $(nproc) -s neon-pg-ext \
     && rm -rf pg_install/build \
@@ -104,7 +104,7 @@ RUN mkdir -p /data/.neon/ && \
 
 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH /usr/local/v16/lib
+ENV LD_LIBRARY_PATH=/usr/local/v16/lib
 
 
 VOLUME ["/data"]
@@ -112,5 +112,5 @@ USER neon
 EXPOSE 6400
 EXPOSE 9898
 
-CMD /usr/local/bin/pageserver -D /data/.neon
+CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]
 
diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index a72092e8e2..d39d36e1b6 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -58,7 +58,7 @@ RUN set -e \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
 # protobuf-compiler (protoc)
-ENV PROTOC_VERSION 25.1
+ENV PROTOC_VERSION=25.1
 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
     && unzip -q protoc.zip -d protoc \
     && mv protoc/bin/protoc /usr/local/bin/protoc \
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 054d44e0ec..7acaf2f2fd 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
     DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
     make clean && cp -R /sfcgal/* /
 
-ENV PATH "/usr/local/pgsql/bin:$PATH"
+ENV PATH="/usr/local/pgsql/bin:$PATH"
 
 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
     echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
@@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ARG PG_VERSION
-ENV PATH "/usr/local/pgsql/bin:$PATH"
+ENV PATH="/usr/local/pgsql/bin:$PATH"
 
 RUN case "${PG_VERSION}" in \
       "v14" | "v15") \
@@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ARG PG_VERSION
-ENV PATH "/usr/local/pgsql/bin:$PATH"
+ENV PATH="/usr/local/pgsql/bin:$PATH"
 
 RUN case "${PG_VERSION}" in \
       "v14") \
@@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \
 FROM build-deps AS pg-cron-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
     echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
     mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -506,7 +506,7 @@ RUN apt-get update && \
         libboost-system1.74-dev \
         libeigen3-dev
 
-ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
     echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
     mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
@@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
 FROM build-deps AS pg-uuidv7-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
     echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
     mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
 FROM build-deps AS pg-roaringbitmap-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
     echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
     mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
 FROM build-deps AS pg-semver-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
     echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
     mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
@@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ARG PG_VERSION
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
       "v14" | "v15") \
         export PG_EMBEDDING_VERSION=0.3.5 \
@@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
     echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
     mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
@@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -
 FROM build-deps AS wal2json-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
     echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
     mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
@@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
 FROM build-deps AS pg-ivm-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
     echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
     mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
 FROM build-deps AS pg-partman-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
     echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
     mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -1034,6 +1034,6 @@ RUN apt update &&  \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
 
-ENV LANG en_US.utf8
+ENV LANG=en_US.utf8
 USER postgres
 ENTRYPOINT ["/usr/local/bin/compute_ctl"]
diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 7d005c7139..41d6e11725 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -416,7 +416,7 @@ build: |
   # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
   # requires cgroup v2, so we'll build cgroup-tools ourselves.
   FROM debian:bullseye-slim as libcgroup-builder
-  ENV LIBCGROUP_VERSION v2.0.3
+  ENV LIBCGROUP_VERSION=v2.0.3
 
   RUN set -exu \
       && apt update \
@@ -460,7 +460,7 @@ build: |
           pkg-config
 
   # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-  ENV PGBOUNCER_TAG pgbouncer_1_22_1
+  ENV PGBOUNCER_TAG=pgbouncer_1_22_1
   RUN set -e \
       && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
       && cd pgbouncer \

From a8eebdb07210ccf235122e2a9a3ea228e4d887f9 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Fri, 9 Aug 2024 09:36:29 +0200
Subject: [PATCH 39/48] Run a subset of benchmarking job steps on GitHub action
 runners in Azure - closer to the system under test (#8651)

## Problem

Latency from one cloud provider to another one is higher than within the
same cloud provider.
Some of our benchmarks are latency sensitive - we run a pgbench or psql
in the github action runner and the system under test is running in Neon
(database project).
For realistic perf tps and latency results we need to compare apples to
apples and run the database client in the same "latency distance" for
all tests.

## Summary of changes

Move job steps that test Neon databases deployed on Azure into Azure
action runners.
- bench strategy variant using azure database
- pgvector strategy variant using azure database
- pgbench-compare strategy variants using azure database

## Test run

https://github.com/neondatabase/neon/actions/runs/10314848502
---
 .github/actionlint.yml             |  1 +
 .github/workflows/benchmarking.yml | 88 +++++++++++++++++++++++-------
 2 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index d27fa01efa..a5282876d0 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -13,3 +13,4 @@ config-variables:
   - REMOTE_STORAGE_AZURE_CONTAINER
   - REMOTE_STORAGE_AZURE_REGION
   - SLACK_UPCOMING_RELEASE_CHANNEL_ID
+  - DEV_AWS_OIDC_ROLE_ARN
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 0f4dac841e..6f80d6e431 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -56,6 +56,10 @@ concurrency:
 jobs:
   bench:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # Required for OIDC authentication in azure runners
     strategy:
       fail-fast: false
       matrix:
@@ -63,9 +67,13 @@ jobs:
           - DEFAULT_PG_VERSION: 16
             PLATFORM: "neon-staging"
             region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            RUNNER: [ self-hosted, us-east-2, x64 ]
+            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
           - DEFAULT_PG_VERSION: 16
             PLATFORM: "azure-staging"
             region_id: 'azure-eastus2'
+            RUNNER: [ self-hosted, eastus2, x64 ]
+            IMAGE: neondatabase/build-tools:pinned
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "300"
       TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -76,14 +84,21 @@ jobs:
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.PLATFORM }}
 
-    runs-on: [ self-hosted, us-east-2, x64 ]
+    runs-on: ${{ matrix.RUNNER }}
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: ${{ matrix.IMAGE }}
       options: --init
 
     steps:
     - uses: actions/checkout@v4
 
+    - name: Configure AWS credentials # necessary on Azure runners
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}  
+        role-duration-seconds: 18000 # 5 hours
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:
@@ -161,6 +176,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
+    
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:
@@ -237,6 +253,9 @@ jobs:
       id: pgbench-compare-matrix
       run: |
         region_id_default=${{ env.DEFAULT_REGION_ID }}
+        runner_default='["self-hosted", "us-east-2", "x64"]'
+        runner_azure='["self-hosted", "eastus2", "x64"]'
+        image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
         matrix='{
           "pg_version" : [
             16
@@ -250,16 +269,19 @@ jobs:
             "neonvm-captest-new"
           ],
           "db_size": [ "10gb" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
+          "runner": ['"$runner_default"'],
+          "image": [ "'"$image_default"'" ],
+          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
         }'
 
         if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
         fi
 
         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -302,6 +324,10 @@ jobs:
   pgbench-compare:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     needs: [ generate-matrices ]
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # Required for OIDC authentication in azure runners
 
     strategy:
       fail-fast: false
@@ -317,9 +343,9 @@ jobs:
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.platform }}
 
-    runs-on: [ self-hosted, us-east-2, x64 ]
+    runs-on: ${{ matrix.runner }}
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: ${{ matrix.image }}
       options: --init
 
     # Increase timeout to 8h, default timeout is 6h
@@ -328,6 +354,13 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
+    - name: Configure AWS credentials # necessary on Azure runners
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+        
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:
@@ -435,12 +468,20 @@ jobs:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
   pgbench-pgvector:
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # Required for OIDC authentication in azure runners
     strategy:
       fail-fast: false
       matrix:
         include:
           - PLATFORM: "neonvm-captest-pgvector"
+            RUNNER: [ self-hosted, us-east-2, x64 ]
+            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
           - PLATFORM: "azure-captest-pgvector"
+            RUNNER: [ self-hosted, eastus2, x64 ]
+            IMAGE: neondatabase/build-tools:pinned
 
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -453,9 +494,9 @@ jobs:
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.PLATFORM }}
 
-    runs-on: [ self-hosted, us-east-2, x64 ]
+    runs-on: ${{ matrix.RUNNER }}
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: ${{ matrix.IMAGE }}
       options: --init
 
     steps:
@@ -466,12 +507,12 @@ jobs:
     - name: Install postgresql-16 where pytest expects it
       run: |
         cd /home/nonroot
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb 
-        dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb 
+        dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
+        dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
+        dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
         mkdir -p /tmp/neon/pg_install/v16/bin
         ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench  
         ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql  
@@ -496,6 +537,13 @@ jobs:
         esac
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+        
+    - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
 
     - name: Benchmark pgvector hnsw indexing
       uses: ./.github/actions/run-python-test-set
@@ -524,7 +572,7 @@ jobs:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-
+    
     - name: Create Allure report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate

From 9dfed93f70dd86fe77b84eb1efa88d9676b51fba Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 9 Aug 2024 09:09:29 +0100
Subject: [PATCH 40/48] Revert "proxy: update tokio-postgres to allow arbitrary
 config params (#8076)" (#8654)

This reverts #8076 - which was already reverted from the release branch
since forever (it would have been a breaking change to release for all
users who currently set TimeZone options). It's causing conflicts now so
we should revert it here as well.
---
 Cargo.lock                            |   8 +-
 libs/postgres_connection/src/lib.rs   |  50 +++++-----
 proxy/src/compute.rs                  | 129 ++++++++++++--------------
 proxy/src/serverless/backend.rs       |   4 -
 proxy/src/serverless/sql_over_http.rs |   1 -
 test_runner/regress/test_proxy.py     |  19 ----
 6 files changed, 92 insertions(+), 119 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f565119dbd..031fae0f37 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3960,7 +3960,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -3973,7 +3973,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -3992,7 +3992,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -6187,7 +6187,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs
index fdabcbacb2..9f57f3d507 100644
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -144,7 +144,20 @@ impl PgConnectionConfig {
             // implement and this function is hardly a bottleneck. The function is only called around
             // establishing a new connection.
             #[allow(unstable_name_collisions)]
-            config.options(&encode_options(&self.options));
+            config.options(
+                &self
+                    .options
+                    .iter()
+                    .map(|s| {
+                        if s.contains(['\\', ' ']) {
+                            Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
+                        } else {
+                            Cow::Borrowed(s.as_str())
+                        }
+                    })
+                    .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
+                    .collect::<String>(),
+            );
         }
         config
     }
@@ -165,21 +178,6 @@ impl PgConnectionConfig {
     }
 }
 
-#[allow(unstable_name_collisions)]
-fn encode_options(options: &[String]) -> String {
-    options
-        .iter()
-        .map(|s| {
-            if s.contains(['\\', ' ']) {
-                Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
-            } else {
-                Cow::Borrowed(s.as_str())
-            }
-        })
-        .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
-        .collect::<String>()
-}
-
 impl fmt::Display for PgConnectionConfig {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // The password is intentionally hidden and not part of this display string.
@@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig {
 
 #[cfg(test)]
 mod tests_pg_connection_config {
-    use crate::{encode_options, PgConnectionConfig};
+    use crate::PgConnectionConfig;
     use once_cell::sync::Lazy;
     use url::Host;
 
@@ -257,12 +255,18 @@ mod tests_pg_connection_config {
 
     #[test]
     fn test_with_options() {
-        let options = encode_options(&[
-            "hello".to_owned(),
-            "world".to_owned(),
-            "with space".to_owned(),
-            "and \\ backslashes".to_owned(),
+        let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
+            "hello",
+            "world",
+            "with space",
+            "and \\ backslashes",
         ]);
-        assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
+        assert_eq!(cfg.host(), &*STUB_HOST);
+        assert_eq!(cfg.port(), 123);
+        assert_eq!(cfg.raw_address(), "stub.host.example:123");
+        assert_eq!(
+            cfg.to_tokio_postgres_config().get_options(),
+            Some("hello world with\\ space and\\ \\\\\\ backslashes")
+        );
     }
 }
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 21687160ea..18c82fe379 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -103,8 +103,12 @@ impl ConnCfg {
 
     /// Reuse password or auth keys from the other config.
     pub fn reuse_password(&mut self, other: Self) {
-        if let Some(password) = other.get_auth() {
-            self.auth(password);
+        if let Some(password) = other.get_password() {
+            self.password(password);
+        }
+
+        if let Some(keys) = other.get_auth_keys() {
+            self.auth_keys(keys);
         }
     }
 
@@ -120,64 +124,48 @@ impl ConnCfg {
 
     /// Apply startup message params to the connection config.
     pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
-        let mut client_encoding = false;
-        for (k, v) in params.iter() {
-            match k {
-                "user" => {
-                    // Only set `user` if it's not present in the config.
-                    // Link auth flow takes username from the console's response.
-                    if self.get_user().is_none() {
-                        self.user(v);
-                    }
+        // Only set `user` if it's not present in the config.
+        // Link auth flow takes username from the console's response.
+        if let (None, Some(user)) = (self.get_user(), params.get("user")) {
+            self.user(user);
+        }
+
+        // Only set `dbname` if it's not present in the config.
+        // Link auth flow takes dbname from the console's response.
+        if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
+            self.dbname(dbname);
+        }
+
+        // Don't add `options` if they were only used for specifying a project.
+        // Connection pools don't support `options`, because they affect backend startup.
+        if let Some(options) = filtered_options(params) {
+            self.options(&options);
+        }
+
+        if let Some(app_name) = params.get("application_name") {
+            self.application_name(app_name);
+        }
+
+        // TODO: This is especially ugly...
+        if let Some(replication) = params.get("replication") {
+            use tokio_postgres::config::ReplicationMode;
+            match replication {
+                "true" | "on" | "yes" | "1" => {
+                    self.replication_mode(ReplicationMode::Physical);
                 }
                 "database" => {
-                    // Only set `dbname` if it's not present in the config.
-                    // Link auth flow takes dbname from the console's response.
-                    if self.get_dbname().is_none() {
-                        self.dbname(v);
-                    }
-                }
-                "options" => {
-                    // Don't add `options` if they were only used for specifying a project.
-                    // Connection pools don't support `options`, because they affect backend startup.
-                    if let Some(options) = filtered_options(v) {
-                        self.options(&options);
-                    }
-                }
-
-                // the special ones in tokio-postgres that we don't want being set by the user
-                "dbname" => {}
-                "password" => {}
-                "sslmode" => {}
-                "host" => {}
-                "port" => {}
-                "connect_timeout" => {}
-                "keepalives" => {}
-                "keepalives_idle" => {}
-                "keepalives_interval" => {}
-                "keepalives_retries" => {}
-                "target_session_attrs" => {}
-                "channel_binding" => {}
-                "max_backend_message_size" => {}
-
-                "client_encoding" => {
-                    client_encoding = true;
-                    // only error should be from bad null bytes,
-                    // but we've already checked for those.
-                    _ = self.param("client_encoding", v);
-                }
-
-                _ => {
-                    // only error should be from bad null bytes,
-                    // but we've already checked for those.
-                    _ = self.param(k, v);
+                    self.replication_mode(ReplicationMode::Logical);
                 }
+                _other => {}
             }
         }
-        if !client_encoding {
-            // for compatibility since we removed it from tokio-postgres
-            self.param("client_encoding", "UTF8").unwrap();
-        }
+
+        // TODO: extend the list of the forwarded startup parameters.
+        // Currently, tokio-postgres doesn't allow us to pass
+        // arbitrary parameters, but the ones above are a good start.
+        //
+        // This and the reverse params problem can be better addressed
+        // in a bespoke connection machinery (a new library for that sake).
     }
 }
 
@@ -350,9 +338,10 @@ impl ConnCfg {
 }
 
 /// Retrieve `options` from a startup message, dropping all proxy-secific flags.
-fn filtered_options(options: &str) -> Option<String> {
+fn filtered_options(params: &StartupMessageParams) -> Option<String> {
     #[allow(unstable_name_collisions)]
-    let options: String = StartupMessageParams::parse_options_raw(options)
+    let options: String = params
+        .options_raw()?
         .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
         .intersperse(" ") // TODO: use impl from std once it's stabilized
         .collect();
@@ -424,23 +413,27 @@ mod tests {
     #[test]
     fn test_filtered_options() {
         // Empty options is unlikely to be useful anyway.
-        assert_eq!(filtered_options(""), None);
+        let params = StartupMessageParams::new([("options", "")]);
+        assert_eq!(filtered_options(&params), None);
 
         // It's likely that clients will only use options to specify endpoint/project.
-        let params = "project=foo";
-        assert_eq!(filtered_options(params), None);
+        let params = StartupMessageParams::new([("options", "project=foo")]);
+        assert_eq!(filtered_options(&params), None);
 
         // Same, because unescaped whitespaces are no-op.
-        let params = " project=foo ";
-        assert_eq!(filtered_options(params), None);
+        let params = StartupMessageParams::new([("options", " project=foo ")]);
+        assert_eq!(filtered_options(&params).as_deref(), None);
 
-        let params = r"\  project=foo \ ";
-        assert_eq!(filtered_options(params).as_deref(), Some(r"\  \ "));
+        let params = StartupMessageParams::new([("options", r"\  project=foo \ ")]);
+        assert_eq!(filtered_options(&params).as_deref(), Some(r"\  \ "));
 
-        let params = "project = foo";
-        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
+        let params = StartupMessageParams::new([("options", "project = foo")]);
+        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
 
-        let params = "project = foo neon_endpoint_type:read_write   neon_lsn:0/2";
-        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
+        let params = StartupMessageParams::new([(
+            "options",
+            "project = foo neon_endpoint_type:read_write   neon_lsn:0/2",
+        )]);
+        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
     }
 }
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 80d46c67eb..295ea1a1c7 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -236,10 +236,6 @@ impl ConnectMechanism for TokioMechanism {
             .dbname(&self.conn_info.dbname)
             .connect_timeout(timeout);
 
-        config
-            .param("client_encoding", "UTF8")
-            .expect("client encoding UTF8 is always valid");
-
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let res = config.connect(tokio_postgres::NoTls).await;
         drop(pause);
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 77ec6b1c73..e5b6536328 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -203,7 +203,6 @@ fn get_conn_info(
             options = Some(NeonOptions::parse_options_raw(&value));
         }
     }
-    ctx.set_db_options(params.freeze());
 
     let user_info = ComputeUserInfo {
         endpoint,
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 8ed44b1094..f446f4f200 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -53,25 +53,6 @@ def test_proxy_select_1(static_proxy: NeonProxy):
     assert out[0][0] == 42
 
 
-def test_proxy_server_params(static_proxy: NeonProxy):
-    """
-    Test that server params are passing through to postgres
-    """
-
-    out = static_proxy.safe_psql(
-        "select to_json('0 seconds'::interval)", options="-c intervalstyle=iso_8601"
-    )
-    assert out[0][0] == "PT0S"
-    out = static_proxy.safe_psql(
-        "select to_json('0 seconds'::interval)", options="-c intervalstyle=sql_standard"
-    )
-    assert out[0][0] == "0"
-    out = static_proxy.safe_psql(
-        "select to_json('0 seconds'::interval)", options="-c intervalstyle=postgres"
-    )
-    assert out[0][0] == "00:00:00"
-
-
 def test_password_hack(static_proxy: NeonProxy):
     """
     Check the PasswordHack auth flow: an alternative to SCRAM auth for

From f8c0da43b5b2ed68d0a7d3cce66cb09ab93575f5 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <iskyzh@gmail.com>
Date: Fri, 9 Aug 2024 16:18:55 +0800
Subject: [PATCH 41/48] fix(neon): disable create tablespace stmt (#8657)

part of https://github.com/neondatabase/neon/issues/8653

Disable create tablespace stmt. It turns out it requires much less
effort to do the regress test mode flag than patching the test cases,
and given that we might need to support tablespaces in the future, I
decided to add a new flag `regress_test_mode` to change the behavior of
create tablespace.

Tested manually that without setting regress_test_mode, create
tablespace will be rejected.



---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pgxn/neon/control_plane_connector.c    | 21 +++++++++++++++++++++
 test_runner/regress/test_pg_regress.py | 25 ++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index 93252e6b29..de023da5c4 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -45,6 +45,7 @@ static const char *jwt_token = NULL;
 /* GUCs */
 static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;
+static bool RegressTestMode = false;
 
 /*
  * CURL docs say that this buffer must exist until we call curl_easy_cleanup
@@ -802,6 +803,14 @@ NeonProcessUtility(
 		case T_DropRoleStmt:
 			HandleDropRole(castNode(DropRoleStmt, parseTree));
 			break;
+		case T_CreateTableSpaceStmt:
+			if (!RegressTestMode)
+			{
+				ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					errmsg("CREATE TABLESPACE is not supported on Neon")));
+			}
+   			break;
 		default:
 			break;
 	}
@@ -864,6 +873,18 @@ InitControlPlaneConnector()
 							 NULL,
 							 NULL);
 
+	DefineCustomBoolVariable(
+							 "neon.regress_test_mode",
+							 "Controls whether we are running in the regression test mode",
+							 NULL,
+							 &RegressTestMode,
+							 false,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
 	jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
 	if (!jwt_token)
 	{
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 6f7ea0092a..45ce5b1c5b 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -144,7 +144,13 @@ def test_pg_regress(
     )
 
     # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("main")
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            # Enable the test mode, so that we don't need to patch the test cases.
+            "neon.regress_test_mode = true",
+        ],
+    )
     endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
 
     # Create some local directories for pg_regress to run in.
@@ -207,7 +213,14 @@ def test_isolation(
 
     # Connect to postgres and create a database called "regression".
     # isolation tests use prepared transactions, so enable them
-    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "max_prepared_transactions=100",
+            # Enable the test mode, so that we don't need to patch the test cases.
+            "neon.regress_test_mode = true",
+        ],
+    )
     endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
 
     # Create some local directories for pg_isolation_regress to run in.
@@ -268,7 +281,13 @@ def test_sql_regress(
     )
 
     # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("main")
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            # Enable the test mode, so that we don't need to patch the test cases.
+            "neon.regress_test_mode = true",
+        ],
+    )
     endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
 
     # Create some local directories for pg_regress to run in.

From cbba8e339036e98f632407f194c0e947d133daa9 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 9 Aug 2024 12:05:43 +0100
Subject: [PATCH 42/48] CI(pin-build-tools-image): fix permissions for Azure
 login (#8671)

## Problem

Azure login fails in `pin-build-tools-image` workflow because the job
doesn't have the required permissions.

```
Error: Please make sure to give write permissions to id-token in the workflow.
Error: Login failed with Error: Error message: Unable to get ACTIONS_ID_TOKEN_REQUEST_URL env variable. Double check if the 'auth-type' is correct. Refer to https://github.com/Azure/login#readme for more information.
```

## Summary of changes
- Add `id-token: write` permission to `pin-build-tools-image`
- Add an input to force image tagging
- Unify pushing to Docker Hub with other registries
- Split the job into two to have less if's
---
 .github/workflows/pin-build-tools-image.yml | 50 +++++++++++++--------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml
index cf10910b0b..2e79498fc4 100644
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -7,12 +7,20 @@ on:
         description: 'Source tag'
         required: true
         type: string
+      force:
+        description: 'Force the image to be pinned'
+        default: false
+        type: boolean
   workflow_call:
     inputs:
       from-tag:
         description: 'Source tag'
         required: true
         type: string
+      force:
+        description: 'Force the image to be pinned'
+        default: false
+        type: boolean
 
 defaults:
   run:
@@ -22,15 +30,18 @@ concurrency:
   group: pin-build-tools-image-${{ inputs.from-tag }}
   cancel-in-progress: false
 
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
 
-jobs:
-  tag-image:
-    runs-on: ubuntu-22.04
+env:
+  FROM_TAG: ${{ inputs.from-tag }}
+  TO_TAG: pinned
 
-    env:
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: pinned
+jobs:
+  check-manifests:
+    runs-on: ubuntu-22.04
+    outputs:
+      skip: ${{ steps.check-manifests.outputs.skip }}
 
     steps:
       - name: Check if we really need to pin the image
@@ -47,27 +58,31 @@ jobs:
 
           echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
 
+  tag-image:
+    needs: check-manifests
+
+    # use format(..) to catch both inputs.force = true AND inputs.force = 'true'
+    if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'
+
+    runs-on: ubuntu-22.04
+
+    permissions:
+      id-token: write # for `azure/login`
+
+    steps:
       - uses: docker/login-action@v3
-        if: steps.check-manifests.outputs.skip == 'false'
+
         with:
           username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
           password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
 
-      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
-        if: steps.check-manifests.outputs.skip == 'false'
-        run: |
-          docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
-                                             neondatabase/build-tools:${FROM_TAG}
-
       - uses: docker/login-action@v3
-        if: steps.check-manifests.outputs.skip == 'false'
         with:
           registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
           username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
           password: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
       - name: Azure login
-        if: steps.check-manifests.outputs.skip == 'false'
         uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
         with:
           client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
@@ -75,13 +90,12 @@ jobs:
           subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
 
       - name: Login to ACR
-        if: steps.check-manifests.outputs.skip == 'false'
         run: |
           az acr login --name=neoneastus2
 
-      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR
-        if: steps.check-manifests.outputs.skip == 'false'
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
         run: |
           docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
                                           -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
+                                          -t neondatabase/build-tools:${TO_TAG} \
                                              neondatabase/build-tools:${FROM_TAG}

From e9a378d1aa72c245044d63f3b53193f424f507d4 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 9 Aug 2024 14:01:56 +0100
Subject: [PATCH 43/48] pageserver: don't treat NotInitialized::Stopped as
 unexpected (#8675)

## Problem

This type of error can happen during shutdown & was triggering a circuit
breaker alert.

## Summary of changes

- Map NotIntialized::Stopped to CompactionError::ShuttingDown, so that
we may handle it cleanly
---
 pageserver/src/tenant/timeline.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 76dcb5645f..f810df5a56 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4412,11 +4412,11 @@ impl From<CollectKeySpaceError> for CompactionError {
 impl From<super::upload_queue::NotInitialized> for CompactionError {
     fn from(value: super::upload_queue::NotInitialized) -> Self {
         match value {
-            super::upload_queue::NotInitialized::Uninitialized
-            | super::upload_queue::NotInitialized::Stopped => {
+            super::upload_queue::NotInitialized::Uninitialized => {
                 CompactionError::Other(anyhow::anyhow!(value))
             }
-            super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown,
+            super::upload_queue::NotInitialized::ShuttingDown
+            | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown,
         }
     }
 }

From 494023f5dffc6d74affa4fc4a4342c6489ff7178 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 9 Aug 2024 15:45:07 +0100
Subject: [PATCH 44/48] storcon: skip draining shard if it's secondary is
 lagging too much (#8644)

## Problem
Migrations of tenant shards with cold secondaries are holding up drains
in during production deployments.

## Summary of changes
If a secondary locations is lagging by more than 256MiB (configurable,
but that's the default), then skip cutting it over to the secondary as part of the node drain.
---
 control_plane/src/local_env.rs                |   3 +
 control_plane/src/storage_controller.rs       |   4 +
 pageserver/src/tenant/secondary/downloader.rs |   6 +-
 storage_controller/src/drain_utils.rs         | 225 ++++++++++++++++++
 storage_controller/src/lib.rs                 |   1 +
 storage_controller/src/main.rs                |   6 +
 storage_controller/src/reconciler.rs          |  74 +++++-
 storage_controller/src/service.rs             | 222 ++++++++++++-----
 storage_controller/src/tenant_shard.rs        |   4 +-
 test_runner/fixtures/neon_fixtures.py         |  64 +++++
 test_runner/fixtures/pageserver/http.py       |   6 +
 .../test_storage_controller_scale.py          |  48 +---
 .../regress/test_storage_controller.py        | 113 +++++++++
 13 files changed, 666 insertions(+), 110 deletions(-)
 create mode 100644 storage_controller/src/drain_utils.rs

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 505d157efd..15bbac702f 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -158,6 +158,8 @@ pub struct NeonStorageControllerConf {
 
     /// Threshold for auto-splitting a tenant into shards
     pub split_threshold: Option<u64>,
+
+    pub max_secondary_lag_bytes: Option<u64>,
 }
 
 impl NeonStorageControllerConf {
@@ -173,6 +175,7 @@ impl Default for NeonStorageControllerConf {
             max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
             max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
             split_threshold: None,
+            max_secondary_lag_bytes: None,
         }
     }
 }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index e054e9ee57..f180e922e8 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -383,6 +383,10 @@ impl StorageController {
             args.push(format!("--split-threshold={split_threshold}"))
         }
 
+        if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() {
+            args.push(format!("--max-secondary-lag-bytes={lag}"))
+        }
+
         args.push(format!(
             "--neon-local-repo-dir={}",
             self.env.base_data_dir.display()
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 27439d4f03..135e73b57f 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
     backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
-    id::TimelineId, serde_system_time,
+    id::TimelineId, pausable_failpoint, serde_system_time,
 };
 
 use super::{
@@ -1146,12 +1146,14 @@ impl<'a> TenantDownloader<'a> {
         layer: HeatMapLayer,
         ctx: &RequestContext,
     ) -> Result<Option<HeatMapLayer>, UpdateError> {
-        // Failpoint for simulating slow remote storage
+        // Failpoints for simulating slow remote storage
         failpoint_support::sleep_millis_async!(
             "secondary-layer-download-sleep",
             &self.secondary_state.cancel
         );
 
+        pausable_failpoint!("secondary-layer-download-pausable");
+
         let local_path = local_layer_path(
             self.conf,
             tenant_shard_id,
diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs
new file mode 100644
index 0000000000..dea1f04649
--- /dev/null
+++ b/storage_controller/src/drain_utils.rs
@@ -0,0 +1,225 @@
+use std::{
+    collections::{BTreeMap, HashMap},
+    sync::Arc,
+};
+
+use pageserver_api::controller_api::NodeSchedulingPolicy;
+use utils::{id::NodeId, shard::TenantShardId};
+
+use crate::{
+    background_node_operations::OperationError, node::Node, scheduler::Scheduler,
+    tenant_shard::TenantShard,
+};
+
+pub(crate) struct TenantShardIterator<F> {
+    tenants_accessor: F,
+    inspected_all_shards: bool,
+    last_inspected_shard: Option<TenantShardId>,
+}
+
+/// A simple iterator which can be used in tandem with [`crate::service::Service`]
+/// to iterate over all known tenant shard ids without holding the lock on the
+/// service state at all times.
+impl<F> TenantShardIterator<F>
+where
+    F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
+{
+    pub(crate) fn new(tenants_accessor: F) -> Self {
+        Self {
+            tenants_accessor,
+            inspected_all_shards: false,
+            last_inspected_shard: None,
+        }
+    }
+
+    /// Returns the next tenant shard id if one exists
+    pub(crate) fn next(&mut self) -> Option<TenantShardId> {
+        if self.inspected_all_shards {
+            return None;
+        }
+
+        match (self.tenants_accessor)(self.last_inspected_shard) {
+            Some(tid) => {
+                self.last_inspected_shard = Some(tid);
+                Some(tid)
+            }
+            None => {
+                self.inspected_all_shards = true;
+                None
+            }
+        }
+    }
+
+    /// Returns true when the end of the iterator is reached and false otherwise
+    pub(crate) fn finished(&self) -> bool {
+        self.inspected_all_shards
+    }
+}
+
+/// Check that the state of the node being drained is as expected:
+/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`]
+pub(crate) fn validate_node_state(
+    node_id: &NodeId,
+    nodes: Arc<HashMap<NodeId, Node>>,
+) -> Result<(), OperationError> {
+    let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged(
+        format!("node {} was removed", node_id).into(),
+    ))?;
+
+    let current_policy = node.get_scheduling();
+    if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
+        // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
+        // about it
+        return Err(OperationError::NodeStateChanged(
+            format!("node {} changed state to {:?}", node_id, current_policy).into(),
+        ));
+    }
+
+    Ok(())
+}
+
+/// Struct that houses a few utility methods for draining pageserver nodes
+pub(crate) struct TenantShardDrain {
+    pub(crate) drained_node: NodeId,
+    pub(crate) tenant_shard_id: TenantShardId,
+}
+
+impl TenantShardDrain {
+    /// Check if the tenant shard under question is eligible for drainining:
+    /// it's primary attachment is on the node being drained
+    pub(crate) fn tenant_shard_eligible_for_drain(
+        &self,
+        tenants: &BTreeMap<TenantShardId, TenantShard>,
+        scheduler: &Scheduler,
+    ) -> Option<NodeId> {
+        let tenant_shard = tenants.get(&self.tenant_shard_id)?;
+
+        if *tenant_shard.intent.get_attached() != Some(self.drained_node) {
+            return None;
+        }
+
+        match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
+            Some(node) => Some(node),
+            None => {
+                tracing::warn!(
+                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                    "No eligible secondary while draining {}", self.drained_node
+                );
+
+                None
+            }
+        }
+    }
+
+    /// Attempt to reschedule the tenant shard under question to one of its secondary locations
+    /// Returns an Err when the operation should be aborted and Ok(None) when the tenant shard
+    /// should be skipped.
+    pub(crate) fn reschedule_to_secondary<'a>(
+        &self,
+        destination: NodeId,
+        tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
+        scheduler: &mut Scheduler,
+        nodes: &Arc<HashMap<NodeId, Node>>,
+    ) -> Result<Option<&'a mut TenantShard>, OperationError> {
+        let tenant_shard = match tenants.get_mut(&self.tenant_shard_id) {
+            Some(some) => some,
+            None => {
+                // Tenant shard was removed in the meantime.
+                // Skip to the next one, but don't fail the overall operation
+                return Ok(None);
+            }
+        };
+
+        if !nodes.contains_key(&destination) {
+            return Err(OperationError::NodeStateChanged(
+                format!("node {} was removed", destination).into(),
+            ));
+        }
+
+        if !tenant_shard.intent.get_secondary().contains(&destination) {
+            tracing::info!(
+                tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                "Secondary moved away from {destination} during drain"
+            );
+
+            return Ok(None);
+        }
+
+        match tenant_shard.reschedule_to_secondary(Some(destination), scheduler) {
+            Err(e) => {
+                tracing::warn!(
+                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                    "Scheduling error when draining pageserver {} : {}", self.drained_node, e
+                );
+
+                Ok(None)
+            }
+            Ok(()) => {
+                let scheduled_to = tenant_shard.intent.get_attached();
+                tracing::info!(
+                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                    "Rescheduled shard while draining node {}: {} -> {:?}",
+                    self.drained_node,
+                    self.drained_node,
+                    scheduled_to
+                );
+
+                Ok(Some(tenant_shard))
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use utils::{
+        id::TenantId,
+        shard::{ShardCount, ShardNumber, TenantShardId},
+    };
+
+    use super::TenantShardIterator;
+
+    #[test]
+    fn test_tenant_shard_iterator() {
+        let tenant_id = TenantId::generate();
+        let shard_count = ShardCount(8);
+
+        let mut tenant_shards = Vec::default();
+        for i in 0..shard_count.0 {
+            tenant_shards.push((
+                TenantShardId {
+                    tenant_id,
+                    shard_number: ShardNumber(i),
+                    shard_count,
+                },
+                (),
+            ))
+        }
+
+        let tenant_shards = Arc::new(tenant_shards);
+
+        let mut tid_iter = TenantShardIterator::new({
+            let tenants = tenant_shards.clone();
+            move |last_inspected_shard: Option<TenantShardId>| {
+                let entry = match last_inspected_shard {
+                    Some(skip_past) => {
+                        let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
+                        cursor.nth(1)
+                    }
+                    None => tenants.first(),
+                };
+
+                entry.map(|(tid, _)| tid).copied()
+            }
+        });
+
+        let mut iterated_over = Vec::default();
+        while let Some(tid) = tid_iter.next() {
+            iterated_over.push((tid, ()));
+        }
+
+        assert_eq!(iterated_over, *tenant_shards);
+    }
+}
diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs
index 8caf638904..26c258c466 100644
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter;
 mod auth;
 mod background_node_operations;
 mod compute_hook;
+mod drain_utils;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 2799f21fdc..a66e9128bc 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -92,6 +92,11 @@ struct Cli {
     /// Chaos testing
     #[arg(long)]
     chaos_interval: Option<humantime::Duration>,
+
+    // Maximum acceptable lag for the secondary location while draining
+    // a pageserver
+    #[arg(long)]
+    max_secondary_lag_bytes: Option<u64>,
 }
 
 enum StrictMode {
@@ -279,6 +284,7 @@ async fn async_main() -> anyhow::Result<()> {
             .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
         split_threshold: args.split_threshold,
         neon_local_repo_dir: args.neon_local_repo_dir,
+        max_secondary_lag_bytes: args.max_secondary_lag_bytes,
     };
 
     // After loading secrets & config, but before starting anything else, apply database migrations
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 254fdb364e..94db879ade 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -39,6 +39,9 @@ pub(super) struct Reconciler {
     /// to detach this tenant shard.
     pub(crate) detach: Vec<Node>,
 
+    /// Configuration specific to this reconciler
+    pub(crate) reconciler_config: ReconcilerConfig,
+
     pub(crate) config: TenantConfig,
     pub(crate) observed: ObservedState,
 
@@ -73,6 +76,65 @@ pub(super) struct Reconciler {
     pub(crate) persistence: Arc<Persistence>,
 }
 
+pub(crate) struct ReconcilerConfigBuilder {
+    config: ReconcilerConfig,
+}
+
+impl ReconcilerConfigBuilder {
+    pub(crate) fn new() -> Self {
+        Self {
+            config: ReconcilerConfig::default(),
+        }
+    }
+
+    pub(crate) fn secondary_warmup_timeout(self, value: Duration) -> Self {
+        Self {
+            config: ReconcilerConfig {
+                secondary_warmup_timeout: Some(value),
+                ..self.config
+            },
+        }
+    }
+
+    pub(crate) fn secondary_download_request_timeout(self, value: Duration) -> Self {
+        Self {
+            config: ReconcilerConfig {
+                secondary_download_request_timeout: Some(value),
+                ..self.config
+            },
+        }
+    }
+
+    pub(crate) fn build(self) -> ReconcilerConfig {
+        self.config
+    }
+}
+
+#[derive(Default, Debug, Copy, Clone)]
+pub(crate) struct ReconcilerConfig {
+    // During live migration give up on warming-up the secondary
+    // after this timeout.
+    secondary_warmup_timeout: Option<Duration>,
+
+    // During live migrations this is the amount of time that
+    // the pagserver will hold our poll.
+    secondary_download_request_timeout: Option<Duration>,
+}
+
+impl ReconcilerConfig {
+    pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration {
+        const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300);
+        self.secondary_warmup_timeout
+            .unwrap_or(SECONDARY_WARMUP_TIMEOUT_DEFAULT)
+    }
+
+    pub(crate) fn get_secondary_download_request_timeout(&self) -> Duration {
+        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT: Duration = Duration::from_secs(20);
+        self.secondary_download_request_timeout
+            .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
+    }
+}
+
 /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
 pub(crate) struct ReconcileUnits {
     _sem_units: tokio::sync::OwnedSemaphorePermit,
@@ -300,11 +362,13 @@ impl Reconciler {
     ) -> Result<(), ReconcileError> {
         // This is not the timeout for a request, but the total amount of time we're willing to wait
         // for a secondary location to get up to date before
-        const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
+        let total_download_timeout = self.reconciler_config.get_secondary_warmup_timeout();
 
         // This the long-polling interval for the secondary download requests we send to destination pageserver
         // during a migration.
-        const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
+        let request_download_timeout = self
+            .reconciler_config
+            .get_secondary_download_request_timeout();
 
         let started_at = Instant::now();
 
@@ -315,14 +379,14 @@ impl Reconciler {
                         client
                             .tenant_secondary_download(
                                 tenant_shard_id,
-                                Some(REQUEST_DOWNLOAD_TIMEOUT),
+                                Some(request_download_timeout),
                             )
                             .await
                     },
                     &self.service_config.jwt_token,
                     1,
                     3,
-                    REQUEST_DOWNLOAD_TIMEOUT * 2,
+                    request_download_timeout * 2,
                     &self.cancel,
                 )
                 .await
@@ -350,7 +414,7 @@ impl Reconciler {
                 return Ok(());
             } else if status == StatusCode::ACCEPTED {
                 let total_runtime = started_at.elapsed();
-                if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
+                if total_runtime > total_download_timeout {
                     tracing::warn!("Timed out after {}ms downloading layers to {node}.  Progress so far: {}/{} layers, {}/{} bytes",
                         total_runtime.as_millis(),
                         progress.layers_downloaded,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index e391ce65e6..31b2d0c3f5 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -14,10 +14,11 @@ use crate::{
         Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
     },
     compute_hook::NotifyError,
+    drain_utils::{self, TenantShardDrain, TenantShardIterator},
     id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
     metrics::LeadershipStatusGroup,
     persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
-    reconciler::{ReconcileError, ReconcileUnits},
+    reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
     scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
     tenant_shard::{
         MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
@@ -325,6 +326,12 @@ pub struct Config {
 
     // TODO: make this cfg(feature  = "testing")
     pub neon_local_repo_dir: Option<PathBuf>,
+
+    // Maximum acceptable download lag for the secondary location
+    // while draining a node. If the secondary location is lagging
+    // by more than the configured amount, then the secondary is not
+    // upgraded to primary.
+    pub max_secondary_lag_bytes: Option<u64>,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -5187,11 +5194,22 @@ impl Service {
         Ok(())
     }
 
-    /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`],
+    /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler
+    /// configuration
     fn maybe_reconcile_shard(
         &self,
         shard: &mut TenantShard,
         nodes: &Arc<HashMap<NodeId, Node>>,
+    ) -> Option<ReconcilerWaiter> {
+        self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default())
+    }
+
+    /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`],
+    fn maybe_configured_reconcile_shard(
+        &self,
+        shard: &mut TenantShard,
+        nodes: &Arc<HashMap<NodeId, Node>>,
+        reconciler_config: ReconcilerConfig,
     ) -> Option<ReconcilerWaiter> {
         let reconcile_needed = shard.get_reconcile_needed(nodes);
 
@@ -5241,6 +5259,7 @@ impl Service {
             &self.result_tx,
             nodes,
             &self.compute_hook,
+            reconciler_config,
             &self.config,
             &self.persistence,
             units,
@@ -5715,18 +5734,92 @@ impl Service {
         self.gate.close().await;
     }
 
+    /// Spot check the download lag for a secondary location of a shard.
+    /// Should be used as a heuristic, since it's not always precise: the
+    /// secondary might have not downloaded the new heat map yet and, hence,
+    /// is not aware of the lag.
+    ///
+    /// Returns:
+    /// * Ok(None) if the lag could not be determined from the status,
+    /// * Ok(Some(_)) if the lag could be determind
+    /// * Err on failures to query the pageserver.
+    async fn secondary_lag(
+        &self,
+        secondary: &NodeId,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<Option<u64>, mgmt_api::Error> {
+        let nodes = self.inner.read().unwrap().nodes.clone();
+        let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError(
+            StatusCode::NOT_FOUND,
+            format!("Node with id {} not found", secondary),
+        ))?;
+
+        match node
+            .with_client_retries(
+                |client| async move { client.tenant_secondary_status(tenant_shard_id).await },
+                &self.config.jwt_token,
+                1,
+                3,
+                Duration::from_millis(250),
+                &self.cancel,
+            )
+            .await
+        {
+            Some(Ok(status)) => match status.heatmap_mtime {
+                Some(_) => Ok(Some(status.bytes_total - status.bytes_downloaded)),
+                None => Ok(None),
+            },
+            Some(Err(e)) => Err(e),
+            None => Err(mgmt_api::Error::Cancelled),
+        }
+    }
+
     /// Drain a node by moving the shards attached to it as primaries.
     /// This is a long running operation and it should run as a separate Tokio task.
     pub(crate) async fn drain_node(
-        &self,
+        self: &Arc<Self>,
         node_id: NodeId,
         cancel: CancellationToken,
     ) -> Result<(), OperationError> {
-        let mut last_inspected_shard: Option<TenantShardId> = None;
-        let mut inspected_all_shards = false;
+        const MAX_SECONDARY_LAG_BYTES_DEFAULT: u64 = 256 * 1024 * 1024;
+        let max_secondary_lag_bytes = self
+            .config
+            .max_secondary_lag_bytes
+            .unwrap_or(MAX_SECONDARY_LAG_BYTES_DEFAULT);
+
+        // By default, live migrations are generous about the wait time for getting
+        // the secondary location up to speed. When draining, give up earlier in order
+        // to not stall the operation when a cold secondary is encountered.
+        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
+        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
+        let reconciler_config = ReconcilerConfigBuilder::new()
+            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
+            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
+            .build();
+
         let mut waiters = Vec::new();
 
-        while !inspected_all_shards {
+        let mut tid_iter = TenantShardIterator::new({
+            let service = self.clone();
+            move |last_inspected_shard: Option<TenantShardId>| {
+                let locked = &service.inner.read().unwrap();
+                let tenants = &locked.tenants;
+                let entry = match last_inspected_shard {
+                    Some(skip_past) => {
+                        // Skip to the last seen tenant shard id
+                        let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
+
+                        // Skip past the last seen
+                        cursor.nth(1)
+                    }
+                    None => tenants.first_key_value(),
+                };
+
+                entry.map(|(tid, _)| tid).copied()
+            }
+        });
+
+        while !tid_iter.finished() {
             if cancel.is_cancelled() {
                 match self
                     .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
@@ -5745,71 +5838,82 @@ impl Service {
                 }
             }
 
-            {
-                let mut locked = self.inner.write().unwrap();
-                let (nodes, tenants, scheduler) = locked.parts_mut();
+            drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?;
 
-                let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
-                    format!("node {node_id} was removed").into(),
-                ))?;
-
-                let current_policy = node.get_scheduling();
-                if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
-                    // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
-                    // about it
-                    return Err(OperationError::NodeStateChanged(
-                        format!("node {node_id} changed state to {current_policy:?}").into(),
-                    ));
-                }
-
-                let mut cursor = tenants.iter_mut().skip_while({
-                    let skip_past = last_inspected_shard;
-                    move |(tid, _)| match skip_past {
-                        Some(last) => **tid != last,
-                        None => false,
+            while waiters.len() < MAX_RECONCILES_PER_OPERATION {
+                let tid = match tid_iter.next() {
+                    Some(tid) => tid,
+                    None => {
+                        break;
                     }
-                });
+                };
 
-                while waiters.len() < MAX_RECONCILES_PER_OPERATION {
-                    let (tid, tenant_shard) = match cursor.next() {
-                        Some(some) => some,
+                let tid_drain = TenantShardDrain {
+                    drained_node: node_id,
+                    tenant_shard_id: tid,
+                };
+
+                let dest_node_id = {
+                    let locked = self.inner.read().unwrap();
+
+                    match tid_drain
+                        .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler)
+                    {
+                        Some(node_id) => node_id,
                         None => {
-                            inspected_all_shards = true;
-                            break;
+                            continue;
                         }
-                    };
+                    }
+                };
 
-                    // If the shard is not attached to the node being drained, skip it.
-                    if *tenant_shard.intent.get_attached() != Some(node_id) {
-                        last_inspected_shard = Some(*tid);
+                match self.secondary_lag(&dest_node_id, tid).await {
+                    Ok(Some(lag)) if lag <= max_secondary_lag_bytes => {
+                        // The secondary is reasonably up to date.
+                        // Migrate to it
+                    }
+                    Ok(Some(lag)) => {
+                        tracing::info!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Secondary on node {dest_node_id} is lagging by {lag}. Skipping reconcile."
+                        );
                         continue;
                     }
+                    Ok(None) => {
+                        tracing::info!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Could not determine lag for secondary on node {dest_node_id}. Skipping reconcile."
+                        );
+                        continue;
+                    }
+                    Err(err) => {
+                        tracing::warn!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Failed to get secondary lag from node {dest_node_id}. Skipping reconcile: {err}"
+                        );
+                        continue;
+                    }
+                }
 
-                    match tenant_shard.reschedule_to_secondary(None, scheduler) {
-                        Err(e) => {
-                            tracing::warn!(
-                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                "Scheduling error when draining pageserver {} : {e}", node_id
-                            );
-                        }
-                        Ok(()) => {
-                            let scheduled_to = tenant_shard.intent.get_attached();
-                            tracing::info!(
-                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                "Rescheduled shard while draining node {}: {} -> {:?}",
-                                node_id,
-                                node_id,
-                                scheduled_to
-                            );
+                {
+                    let mut locked = self.inner.write().unwrap();
+                    let (nodes, tenants, scheduler) = locked.parts_mut();
+                    let rescheduled = tid_drain.reschedule_to_secondary(
+                        dest_node_id,
+                        tenants,
+                        scheduler,
+                        nodes,
+                    )?;
 
-                            let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
-                            if let Some(some) = waiter {
-                                waiters.push(some);
-                            }
+                    if let Some(tenant_shard) = rescheduled {
+                        let waiter = self.maybe_configured_reconcile_shard(
+                            tenant_shard,
+                            nodes,
+                            reconciler_config,
+                        );
+                        if let Some(some) = waiter {
+                            waiters.push(some);
                         }
                     }
-
-                    last_inspected_shard = Some(*tid);
                 }
             }
 
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index e250f29f98..1fcc3c8547 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -7,7 +7,7 @@ use std::{
 use crate::{
     metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
     persistence::TenantShardPersistence,
-    reconciler::ReconcileUnits,
+    reconciler::{ReconcileUnits, ReconcilerConfig},
     scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
     service::ReconcileResultRequest,
 };
@@ -1063,6 +1063,7 @@ impl TenantShard {
         result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResultRequest>,
         pageservers: &Arc<HashMap<NodeId, Node>>,
         compute_hook: &Arc<ComputeHook>,
+        reconciler_config: ReconcilerConfig,
         service_config: &service::Config,
         persistence: &Arc<Persistence>,
         units: ReconcileUnits,
@@ -1101,6 +1102,7 @@ impl TenantShard {
             generation: self.generation,
             intent: reconciler_intent,
             detach,
+            reconciler_config,
             config: self.config.clone(),
             observed: self.observed.clone(),
             compute_hook: compute_hook.clone(),
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index c6f4404784..844a23d327 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,6 +14,7 @@ import textwrap
 import threading
 import time
 import uuid
+from collections import defaultdict
 from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
@@ -2667,6 +2668,69 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"Got failpoints request response code {res.status_code}")
         res.raise_for_status()
 
+    def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]:
+        """
+        Get the intent and observed placements of all tenants known to the storage controller.
+        """
+        tenants = self.tenant_list()
+
+        tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict(
+            lambda: {
+                "observed": {"attached": None, "secondary": []},
+                "intent": {"attached": None, "secondary": []},
+            }
+        )
+
+        for t in tenants:
+            for node_id, loc_state in t["observed"]["locations"].items():
+                if (
+                    loc_state is not None
+                    and "conf" in loc_state
+                    and loc_state["conf"] is not None
+                    and loc_state["conf"]["mode"]
+                    in set(["AttachedSingle", "AttachedMulti", "AttachedStale"])
+                ):
+                    tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id)
+
+                if (
+                    loc_state is not None
+                    and "conf" in loc_state
+                    and loc_state["conf"] is not None
+                    and loc_state["conf"]["mode"] == "Secondary"
+                ):
+                    tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(
+                        int(node_id)
+                    )
+
+            if "attached" in t["intent"]:
+                tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"][
+                    "attached"
+                ]
+
+            if "secondary" in t["intent"]:
+                tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][
+                    "secondary"
+                ]
+
+        return tenant_placement
+
+    def warm_up_all_secondaries(self):
+        log.info("Warming up all secondary locations")
+
+        tenant_placement = self.get_tenants_placement()
+        for tid, placement in tenant_placement.items():
+            assert placement["observed"]["attached"] is not None
+            primary_id = placement["observed"]["attached"]
+
+            assert len(placement["observed"]["secondary"]) == 1
+            secondary_id = placement["observed"]["secondary"][0]
+
+            parsed_tid = TenantShardId.parse(tid)
+            self.env.get_pageserver(primary_id).http_client().tenant_heatmap_upload(parsed_tid)
+            self.env.get_pageserver(secondary_id).http_client().tenant_secondary_download(
+                parsed_tid, wait_ms=250
+            )
+
     @property
     def workdir(self) -> Path:
         return self.env.repo_dir
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 65d6ff5d62..cd4261f1b8 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -361,6 +361,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         self.verbose_error(res)
         return (res.status_code, res.json())
 
+    def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]):
+        url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status"
+        res = self.get(url)
+        self.verbose_error(res)
+        return res.json()
+
     def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
         assert "tenant_id" not in config.keys()
         res = self.put(
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 04785f7184..297aedfbed 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -2,7 +2,6 @@ import concurrent.futures
 import random
 import time
 from collections import defaultdict
-from typing import Any, Dict
 
 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -24,51 +23,14 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[
     This function takes into account the intersection of the intent and the observed state.
     If they do not match, it asserts out.
     """
-    tenants = env.storage_controller.tenant_list()
-
-    intent = dict()
-    observed = dict()
-
-    tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict(
-        lambda: {
-            "observed": {"attached": None, "secondary": []},
-            "intent": {"attached": None, "secondary": []},
-        }
-    )
-
-    for t in tenants:
-        for node_id, loc_state in t["observed"]["locations"].items():
-            if (
-                loc_state is not None
-                and "conf" in loc_state
-                and loc_state["conf"] is not None
-                and loc_state["conf"]["mode"]
-                in set(["AttachedSingle", "AttachedMulti", "AttachedStale"])
-            ):
-                observed[t["tenant_shard_id"]] = int(node_id)
-                tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id)
-
-            if (
-                loc_state is not None
-                and "conf" in loc_state
-                and loc_state["conf"] is not None
-                and loc_state["conf"]["mode"] == "Secondary"
-            ):
-                tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id))
-
-        if "attached" in t["intent"]:
-            intent[t["tenant_shard_id"]] = t["intent"]["attached"]
-            tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"]
-
-        if "secondary" in t["intent"]:
-            tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][
-                "secondary"
-            ]
-
+    tenant_placement = env.storage_controller.get_tenants_placement()
     log.info(f"{tenant_placement=}")
 
     matching = {
-        tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid]
+        tid: tenant_placement[tid]["intent"]["attached"]
+        for tid in tenant_placement
+        if tenant_placement[tid]["intent"]["attached"]
+        == tenant_placement[tid]["observed"]["attached"]
     }
     assert len(matching) == total_shards
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index eb2cdccdb9..9b2557a165 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -17,6 +17,7 @@ from fixtures.neon_fixtures import (
     PgBin,
     StorageControllerApiException,
     TokenScope,
+    last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -1597,6 +1598,8 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
 
     # Perform a graceful rolling restart
     for ps in env.pageservers:
+        env.storage_controller.warm_up_all_secondaries()
+
         env.storage_controller.retryable_node_operation(
             lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
         )
@@ -1645,6 +1648,115 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
     assert_shard_counts_balanced(env, shard_counts, total_shards)
 
 
+def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    Artificially make a tenant shard's secondary location lag behind the primary
+    and check that storage controller driven node drains skip the lagging tenant shard.
+    Finally, validate that the tenant shard is migrated when a new drain request comes
+    in and it's no longer lagging.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.storage_controller_config = {
+        "max_secondary_lag_bytes": 1 * 1024 * 1024,
+    }
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}')
+
+    # Give things a chance to settle.
+    env.storage_controller.reconcile_until_idle(timeout_secs=30)
+
+    locations = env.storage_controller.locate(tid)
+    assert len(locations) == 1
+    primary: int = locations[0]["node_id"]
+    not_primary = [ps.id for ps in env.pageservers if ps.id != primary]
+    assert len(not_primary) == 1
+    secondary = not_primary[0]
+
+    log.info(f"Paused secondary downloads on {secondary}")
+    env.get_pageserver(secondary).http_client().configure_failpoints(
+        ("secondary-layer-download-pausable", "pause")
+    )
+
+    log.info(f"Ingesting some data for {tid}")
+
+    with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
+        last_flush_lsn_upload(env, endpoint, tid, timeline_id)
+
+    log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}")
+
+    env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid)
+    env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100)
+
+    def secondary_is_lagging():
+        resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid)
+        lag = resp["bytes_total"] - resp["bytes_downloaded"]
+
+        if lag <= 1 * 1024 * 1024:
+            raise Exception(f"Secondary lag not big enough: {lag}")
+
+    log.info(f"Looking for lag to develop on the secondary {secondary}")
+    wait_until(10, 1, secondary_is_lagging)
+
+    log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}")
+    env.storage_controller.retryable_node_operation(
+        lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2
+    )
+
+    env.storage_controller.poll_node_status(
+        primary,
+        PageserverAvailability.ACTIVE,
+        PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
+        max_attempts=6,
+        backoff=5,
+    )
+
+    locations = env.storage_controller.locate(tid)
+    assert len(locations) == 1
+    assert locations[0]["node_id"] == primary
+
+    log.info(f"Unpausing secondary downloads on {secondary}")
+    env.get_pageserver(secondary).http_client().configure_failpoints(
+        ("secondary-layer-download-pausable", "off")
+    )
+    env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100)
+
+    log.info(f"Waiting for lag to reduce on {secondary}")
+
+    def lag_is_acceptable():
+        resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid)
+        lag = resp["bytes_total"] - resp["bytes_downloaded"]
+
+        if lag > 1 * 1024 * 1024:
+            raise Exception(f"Secondary lag not big enough: {lag}")
+
+    wait_until(10, 1, lag_is_acceptable)
+
+    env.storage_controller.node_configure(primary, {"scheduling": "Active"})
+
+    log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}")
+
+    env.storage_controller.retryable_node_operation(
+        lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2
+    )
+
+    env.storage_controller.poll_node_status(
+        primary,
+        PageserverAvailability.ACTIVE,
+        PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
+        max_attempts=6,
+        backoff=5,
+    )
+
+    locations = env.storage_controller.locate(tid)
+    assert len(locations) == 1
+    assert locations[0]["node_id"] == secondary
+
+
 def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_configs()
@@ -1671,6 +1783,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
 
     ps_id_to_drain = env.pageservers[0].id
 
+    env.storage_controller.warm_up_all_secondaries()
     env.storage_controller.retryable_node_operation(
         lambda ps_id: env.storage_controller.node_drain(ps_id),
         ps_id_to_drain,

From 642aa1e160bd6c7c99ff8707a8b91b7309650eb0 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 9 Aug 2024 15:48:16 +0100
Subject: [PATCH 45/48] Dockerfiles: remove cachepot (#8666)

## Problem
We install and try to use `cachepot`. But it is not configured correctly
and doesn't work (after https://github.com/neondatabase/neon/pull/2290)

## Summary of changes
- Remove `cachepot`
---
 .github/workflows/neon_extra_builds.yml |  2 --
 Dockerfile                              | 15 +--------------
 Dockerfile.build-tools                  |  2 --
 libs/utils/src/lib.rs                   |  2 +-
 4 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index d4870e16ad..2ee66cfdc1 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -149,8 +149,6 @@ jobs:
 
     env:
       BUILD_TYPE: release
-      # remove the cachepot wrapper and build without crate caches
-      RUSTC_WRAPPER: ""
       # build with incremental compilation produce partial results
       # so do not attempt to cache this build, also disable the incremental compilation
       CARGO_INCREMENTAL: 0
diff --git a/Dockerfile b/Dockerfile
index 6ed57a84a3..ceb1c7cb55 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,24 +29,12 @@ WORKDIR /home/nonroot
 ARG GIT_VERSION=local
 ARG BUILD_TAG
 
-# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
-# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
-# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build
-ARG RUSTC_WRAPPER=cachepot
-ENV AWS_REGION=eu-central-1
-ENV CACHEPOT_S3_KEY_PREFIX=cachepot
-ARG CACHEPOT_BUCKET=neon-github-dev
-#ARG AWS_ACCESS_KEY_ID
-#ARG AWS_SECRET_ACCESS_KEY
-
 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
 COPY --chown=nonroot . .
 
-# Show build caching stats to check if it was used in the end.
-# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
     && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
       --bin pg_sni_router  \
@@ -58,8 +46,7 @@ RUN set -e \
       --bin proxy  \
       --bin neon_local \
       --bin storage_scrubber \
-      --locked --release \
-    && cachepot -s
+      --locked --release
 
 # Build final image
 #
diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index d39d36e1b6..d6beb61369 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -208,7 +208,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     . "$HOME/.cargo/env" && \
     cargo --version && rustup --version && \
     rustup component add llvm-tools-preview rustfmt clippy && \
-    cargo install --git https://github.com/paritytech/cachepot && \
     cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
     cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
     cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
@@ -216,7 +215,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
     rm -rf /home/nonroot/.cargo/registry && \
     rm -rf /home/nonroot/.cargo/git
-ENV RUSTC_WRAPPER=cachepot
 
 # Show versions
 RUN whoami \
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index a46d68ef33..f4fc0ba57b 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -128,7 +128,7 @@ pub mod circuit_breaker;
 ///
 /// #############################################################################################
 /// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
-/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
+/// We used `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
 /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
 /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
 /// The problem needs further investigation and regular `const` declaration instead of a macro.

From 66f86f184bff8e280bc7ae53fd8b94b843f2766e Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 9 Aug 2024 18:30:15 +0100
Subject: [PATCH 46/48] Update docs/SUMMARY.md (#8665)

## Problem

This page had many dead links, and was confusing for folks looking for
documentation about our product.

Closes: https://github.com/neondatabase/neon/issues/8535

## Summary of changes

- Add a link to the product docs up top
- Remove dead/placeholder links
---
 docs/SUMMARY.md | 59 ++++++++++---------------------------------------
 1 file changed, 12 insertions(+), 47 deletions(-)

diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index b275349168..5fd4080c28 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -1,13 +1,18 @@
 # Summary
 
+# Looking for `neon.tech` docs?
+
+This page linkes to a selection of technical content about the open source code in this repository.
+
+Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code
+in this repository.
+
+# Architecture
+
 [Introduction]()
 - [Separation of Compute and Storage](./separation-compute-storage.md)
 
-# Architecture
-
 - [Compute]()
-  - [WAL proposer]()
-  - [WAL Backpressure]()
   - [Postgres changes](./core_changes.md)
 
 - [Pageserver](./pageserver.md)
@@ -16,33 +21,15 @@
     - [WAL Redo](./pageserver-walredo.md)
     - [Page cache](./pageserver-pagecache.md)
     - [Storage](./pageserver-storage.md)
-        - [Datadir mapping]()
-        - [Layer files]()
-        - [Branching]()
-        - [Garbage collection]()
-    - [Cloud Storage]()
     - [Processing a GetPage request](./pageserver-processing-getpage.md)
     - [Processing WAL](./pageserver-processing-wal.md)
-	- [Management API]()
-	- [Tenant Rebalancing]()
 
 - [WAL Service](walservice.md)
   - [Consensus protocol](safekeeper-protocol.md)
-  - [Management API]()
-  - [Rebalancing]()
-
-- [Control Plane]()
-
-- [Proxy]()
 
 - [Source view](./sourcetree.md)
   - [docker.md](./docker.md) — Docker images and building pipeline.
   - [Error handling and logging](./error-handling.md)
-  - [Testing]()
-    - [Unit testing]()
-    - [Integration testing]()
-    - [Benchmarks]()
-
 
 - [Glossary](./glossary.md)
 
@@ -58,28 +45,6 @@
 
 # RFCs
 
-- [RFCs](./rfcs/README.md)
-
-- [002-storage](rfcs/002-storage.md)
-- [003-laptop-cli](rfcs/003-laptop-cli.md)
-- [004-durability](rfcs/004-durability.md)
-- [005-zenith_local](rfcs/005-zenith_local.md)
-- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
-- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
-- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
-- [008-push-pull](rfcs/008-push-pull.md)
-- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
-- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
-- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
-- [010-storage_details](rfcs/010-storage_details.md)
-- [011-retention-policy](rfcs/011-retention-policy.md)
-- [012-background-tasks](rfcs/012-background-tasks.md)
-- [013-term-history](rfcs/013-term-history.md)
-- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
-- [014-storage-lsm](rfcs/014-storage-lsm.md)
-- [015-storage-messaging](rfcs/015-storage-messaging.md)
-- [016-connection-routing](rfcs/016-connection-routing.md)
-- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
-- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
-- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
-- [cluster-size-limits](rfcs/cluster-size-limits.md)
+Major changes are documented in RFCS:
+- See [RFCs](./rfcs/README.md) for more information
+- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs

From c7c58eeab821c762dd09325a8556985e585359bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sat, 10 Aug 2024 14:04:47 +0200
Subject: [PATCH 47/48] Also pass HOME env var in access_env_vars (#8685)

Noticed this while debugging a test failure in #8673 which only occurs
with real S3 instead of mock S3: if you authenticate to S3 via
`AWS_PROFILE`, then it requires the `HOME` env var to be set so that it
can read inside the `~/.aws` directory.

The scrubber abstraction `StorageScrubber::scrubber_cli` in
`neon_fixtures.py` would otherwise not work. My earlier PR #6556 has
done similar things for the `neon_local` wrapper.

You can try:

```
aws sso login --profile dev
export ENABLE_REAL_S3_REMOTE_STORAGE=y REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests REMOTE_STORAGE_S3_REGION=eu-central-1 AWS_PROFILE=dev
RUST_BACKTRACE=1 BUILD_TYPE=debug DEFAULT_PG_VERSION=16 ./scripts/pytest -vv --tb=short -k test_scrubber_tenant_snapshot
```

before and after this patch: this patch fixes it.
---
 test_runner/fixtures/remote_storage.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 0f2a997b1e..1b6c3c23ba 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -177,9 +177,14 @@ class S3Storage:
 
     def access_env_vars(self) -> Dict[str, str]:
         if self.aws_profile is not None:
-            return {
+            env = {
                 "AWS_PROFILE": self.aws_profile,
             }
+            # Pass through HOME env var because AWS_PROFILE needs it in order to work
+            home = os.getenv("HOME")
+            if home is not None:
+                env["HOME"] = home
+            return env
         if self.access_key is not None and self.secret_key is not None:
             return {
                 "AWS_ACCESS_KEY_ID": self.access_key,

From d69f79c7eb930b8ffd368d7ada479e696f204871 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 11 Aug 2024 12:21:32 +0100
Subject: [PATCH 48/48] chore(deps): bump aiohttp from 3.9.4 to 3.10.2 (#8684)

---
 poetry.lock    | 170 ++++++++++++++++++++++++++-----------------------
 pyproject.toml |   2 +-
 2 files changed, 92 insertions(+), 80 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 9026824558..7db91e51f7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,91 +1,103 @@
 # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
+[[package]]
+name = "aiohappyeyeballs"
+version = "2.3.5"
+description = "Happy Eyeballs for asyncio"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"},
+    {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"},
+]
+
 [[package]]
 name = "aiohttp"
-version = "3.9.4"
+version = "3.10.2"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"},
-    {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"},
-    {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"},
-    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"},
-    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"},
-    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"},
-    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"},
-    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"},
-    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"},
-    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"},
-    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"},
-    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"},
-    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"},
-    {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"},
-    {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"},
-    {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"},
-    {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"},
-    {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"},
-    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"},
-    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"},
-    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"},
-    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"},
-    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"},
-    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"},
-    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"},
-    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"},
-    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"},
-    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"},
-    {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"},
-    {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"},
-    {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"},
-    {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"},
-    {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"},
-    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"},
-    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"},
-    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"},
-    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"},
-    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"},
-    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"},
-    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"},
-    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"},
-    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"},
-    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"},
-    {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"},
-    {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"},
-    {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"},
-    {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"},
-    {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"},
-    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"},
-    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"},
-    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"},
-    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"},
-    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"},
-    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"},
-    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"},
-    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"},
-    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"},
-    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"},
-    {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"},
-    {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"},
-    {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"},
-    {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"},
-    {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"},
-    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"},
-    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"},
-    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"},
-    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"},
-    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"},
-    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"},
-    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"},
-    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"},
-    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"},
-    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"},
-    {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"},
-    {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"},
-    {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"},
+    {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"},
+    {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"},
+    {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"},
+    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"},
+    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"},
+    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"},
+    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"},
+    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"},
+    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"},
+    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"},
+    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"},
+    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"},
+    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"},
+    {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"},
+    {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"},
+    {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"},
+    {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"},
+    {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"},
+    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"},
+    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"},
+    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"},
+    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"},
+    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"},
+    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"},
+    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"},
+    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"},
+    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"},
+    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"},
+    {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"},
+    {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"},
+    {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"},
+    {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"},
+    {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"},
+    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"},
+    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"},
+    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"},
+    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"},
+    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"},
+    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"},
+    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"},
+    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"},
+    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"},
+    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"},
+    {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"},
+    {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"},
+    {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"},
+    {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"},
+    {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"},
+    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"},
+    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"},
+    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"},
+    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"},
+    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"},
+    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"},
+    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"},
+    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"},
+    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"},
+    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"},
+    {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"},
+    {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"},
+    {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"},
+    {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"},
+    {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"},
+    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"},
+    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"},
+    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"},
+    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"},
+    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"},
+    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"},
+    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"},
+    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"},
+    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"},
+    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"},
+    {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"},
+    {file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"},
+    {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"},
 ]
 
 [package.dependencies]
+aiohappyeyeballs = ">=2.3.0"
 aiosignal = ">=1.1.2"
 async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
 attrs = ">=17.3.0"
@@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0"
 yarl = ">=1.0,<2.0"
 
 [package.extras]
-speedups = ["Brotli", "aiodns", "brotlicffi"]
+speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
 
 [[package]]
 name = "aiopg"
@@ -3371,4 +3383,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
+content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055"
diff --git a/pyproject.toml b/pyproject.toml
index cfb569b2ba..ad3961ef55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ psutil = "^5.9.4"
 types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
-aiohttp = "3.9.4"
+aiohttp = "3.10.2"
 pytest-rerunfailures = "^13.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"