From 6f2333f52bfa1bce0039f9d013a885d5a76cf5ff Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 25 Sep 2024 19:07:20 +0300
Subject: [PATCH 01/29] CI: Leave out unnecessary build files from binary
 artifact (#9135)

The pg_install/build directory contains .o files and such intermediate
results from the build, which are not needed in the final tarball.
Except for src/test/regress/regress.so and a few other .so files in that
directory; keep those.

This reduces the size of the neon-Linux-X64-release-artifact.tar.zst
artifact from about 1.5 GB to 700 MB.

(I attempted this a long time ago already, by moving the build/
directory out of pg_install altogether, see PR #2127. But I never got
around to finish that work.)

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/_build-and-test-locally.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 67152b6991..5ea911eb95 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -257,7 +257,15 @@ jobs:
           ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
 
       - name: Install postgres binaries
-        run: cp -a pg_install /tmp/neon/pg_install
+        run: |
+          # Use tar to copy files matching the pattern, preserving the paths in the destionation
+          tar c \
+            pg_install/v* \
+            pg_install/build/*/src/test/regress/*.so \
+            pg_install/build/*/src/test/regress/pg_regress \
+            pg_install/build/*/src/test/isolation/isolationtester \
+            pg_install/build/*/src/test/isolation/pg_isolation_regress \
+            | tar  x -C /tmp/neon
 
       - name: Upload Neon artifact
         uses: ./.github/actions/upload

From 04f32b9526c83cc5e38ba5505bb310f5430c7402 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 25 Sep 2024 17:22:32 +0100
Subject: [PATCH 02/29] tests: remove patching up of az id column (#8968)

This was required since the compat tests used a snapshot generated from
a version of neon local which didn't contain the availability_zone_id
column.
---
 test_runner/fixtures/neon_fixtures.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 201eb1087d..70fe632f49 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -642,9 +642,6 @@ class NeonEnvBuilder:
         patch_script = ""
         for ps in self.env.pageservers:
             patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg}  WHERE node_id = '{ps.id}';"
-            # This is a temporary to get the backward compat test happy
-            # since the compat snapshot was generated with an older version of neon local
-            patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}'  WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;"
         patch_script_path.write_text(patch_script)
 
         # Update the config with info about tenants and timelines

From c6e89445e269d82296340cca55635fd01c3971eb Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 25 Sep 2024 18:22:39 +0100
Subject: [PATCH 03/29] CI(promote-images): fix prod ECR auth (#9146)

A cherry-pick from the previous release (#9131)

## Problem
Login to prod ECR doesn't work anymore:
```
Retrieving registries data through *** SDK...
*** ECR detected with eu-central-1 region
Error: The security token included in the request is invalid.
```

## Summary of changes
- Fix login to prod ECR by using `aws-actions/configure-aws-credentials`
---
 .github/workflows/build_and_test.yml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9dcc9709eb..81a9fd99ae 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -862,6 +862,9 @@ jobs:
     needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
     runs-on: ubuntu-22.04
 
+    permissions:
+      id-token: write # for `aws-actions/configure-aws-credentials`
+
     env:
       VERSIONS: v14 v15 v16 v17
 
@@ -906,13 +909,19 @@ jobs:
           docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
                                               neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
 
+      - name: Configure AWS-prod credentials
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          mask-aws-account-id: true
+          role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }}
+
       - name: Login to prod ECR
         uses: docker/login-action@v3
         if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
         with:
           registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
-          password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}
 
       - name: Copy all images to prod ECR
         if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'

From 6a4f49b08bf8e96ad62fd9ac1550acf3e58d4179 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 25 Sep 2024 16:35:33 -0400
Subject: [PATCH 04/29] fix(pageserver): passthrough partition cancel error
 (#9154)

close https://github.com/neondatabase/neon/issues/9142

## Summary of changes

passthrough CollectKeyspaceError::Cancelled to
CompactionError::ShuttingDown

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline.rs            | 18 ++++++++++++++----
 pageserver/src/tenant/timeline/compaction.rs |  2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d301ba23ea..157c6ab91e 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3601,7 +3601,7 @@ impl Timeline {
                     ctx,
                 )
                 .await
-                .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
+                .map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
 
             if self.cancel.is_cancelled() {
                 return Err(FlushLayerError::Cancelled);
@@ -3840,16 +3840,20 @@ impl Timeline {
         partition_size: u64,
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
+    ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
         let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
             // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
             // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
             // and hence before the compaction task starts.
-            anyhow::bail!("repartition() called concurrently, this should not happen");
+            return Err(CompactionError::Other(anyhow!(
+                "repartition() called concurrently, this should not happen"
+            )));
         };
         let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
         if lsn < *partition_lsn {
-            anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
+            return Err(CompactionError::Other(anyhow!(
+                "repartition() called with LSN going backwards, this should not happen"
+            )));
         }
 
         let distance = lsn.0 - partition_lsn.0;
@@ -4451,6 +4455,12 @@ pub(crate) enum CompactionError {
     Other(anyhow::Error),
 }
 
+impl CompactionError {
+    pub fn is_cancelled(&self) -> bool {
+        matches!(self, CompactionError::ShuttingDown)
+    }
+}
+
 impl From<CollectKeySpaceError> for CompactionError {
     fn from(err: CollectKeySpaceError) -> Self {
         match err {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 6edc28a11b..3de386a2d5 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -390,7 +390,7 @@ impl Timeline {
                 // error but continue.
                 //
                 // Suppress error when it's due to cancellation
-                if !self.cancel.is_cancelled() {
+                if !self.cancel.is_cancelled() && !err.is_cancelled() {
                     tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
                 }
                 (1, false)

From 8ace9ea25ff59c2313d88934655f5c0e9edffa42 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 25 Sep 2024 12:45:05 -0500
Subject: [PATCH 05/29] Format long single DATA line in pgxn/Makefile

This should be a little more readable.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 pgxn/neon/Makefile | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index ddc8155ff3..f1229b2d73 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -25,7 +25,18 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql  neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql
+DATA = \
+	neon--1.0.sql \
+	neon--1.0--1.1.sql \
+	neon--1.1--1.2.sql \
+	neon--1.2--1.3.sql \
+	neon--1.3--1.4.sql \
+	neon--1.4--1.5.sql \
+	neon--1.5--1.4.sql \
+	neon--1.4--1.3.sql \
+	neon--1.3--1.2.sql \
+	neon--1.2--1.1.sql \
+	neon--1.1--1.0.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \

From 684e924211a6a0cefe38b38ad1a5509ea01235a8 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 25 Sep 2024 13:18:34 -0500
Subject: [PATCH 06/29] Fix compute_logical_snapshot_files for v14

The function, pg_ls_logicalsnapdir(), was added in version 15.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute/etc/neon_collector.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml
index 29be0958dd..acb17d3cc0 100644
--- a/compute/etc/neon_collector.yml
+++ b/compute/etc/neon_collector.yml
@@ -195,7 +195,7 @@ metrics:
       -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
       -- temporary snapshot files are renamed to the actual snapshot files after they are
       -- completely built. We only WAL-log the completely built snapshot files.
-      (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
+      (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
 
 # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
 # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
@@ -244,4 +244,3 @@ metrics:
     SELECT slot_name,
            CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
     FROM pg_replication_slots;
-

From 7e560dd00e7c63679aad50322f0b60d8ad32cfb0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 26 Sep 2024 00:29:16 +0300
Subject: [PATCH 07/29] chore: Silence clippy warning with nightly (#9157)

The warning:

    warning: first doc comment paragraph is too long
      --> pageserver/src/tenant/checks.rs:7:1
       |
7 | / /// Checks whether a layer map is valid (i.e., is a valid result
of the current compaction algorithm if no...
8 | | /// The function checks if we can split the LSN range of a delta
layer only at the LSNs of the delta layer...
    9  | | ///
    10 | | /// ```plain
       | |_
       |
= help: for further information visit
https://rust-lang.github.io/rust-clippy/master/index.html#too_long_first_doc_paragraph
= note: `#[warn(clippy::too_long_first_doc_paragraph)]` on by default
    help: add an empty line
       |
7 ~ /// Checks whether a layer map is valid (i.e., is a valid result of
the current compaction algorithm if nothing goes wrong).
    8  + ///
       |

Fix by applying the suggestion.
---
 pageserver/src/tenant/checks.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs
index 8eaa8a001c..1e8fa8d1d6 100644
--- a/pageserver/src/tenant/checks.rs
+++ b/pageserver/src/tenant/checks.rs
@@ -5,6 +5,7 @@ use itertools::Itertools;
 use super::storage_layer::LayerName;
 
 /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
+///
 /// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
 ///
 /// ```plain

From 7bae78186bac794bf46372ef8eeb2928d54ce6df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 26 Sep 2024 02:05:25 +0200
Subject: [PATCH 08/29] Forbid creation of child timelines of archived timeline
 (#9122)

We don't want to allow any new child timelines of archived timelines. If
you want any new child timelines, you should first un-archive the
timeline.

Part of #8088
---
 pageserver/src/http/routes.rs | 4 ++++
 pageserver/src/tenant.rs      | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 6a10d4fb1c..ba38120bf1 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -589,6 +589,10 @@ async fn timeline_create_handler(
                 StatusCode::SERVICE_UNAVAILABLE,
                 HttpErrorBody::from_msg(e.to_string()),
             ),
+            Err(e @ tenant::CreateTimelineError::AncestorArchived) => json_response(
+                StatusCode::NOT_ACCEPTABLE,
+                HttpErrorBody::from_msg(e.to_string()),
+            ),
             Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
                 StatusCode::SERVICE_UNAVAILABLE,
                 HttpErrorBody::from_msg("tenant shutting down".to_string()),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 53cbaea621..2aebf4f999 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -563,6 +563,8 @@ pub enum CreateTimelineError {
     AncestorLsn(anyhow::Error),
     #[error("ancestor timeline is not active")]
     AncestorNotActive,
+    #[error("ancestor timeline is archived")]
+    AncestorArchived,
     #[error("tenant shutting down")]
     ShuttingDown,
     #[error(transparent)]
@@ -1698,6 +1700,11 @@ impl Tenant {
                     return Err(CreateTimelineError::AncestorNotActive);
                 }
 
+                if ancestor_timeline.is_archived() == Some(true) {
+                    info!("tried to branch archived timeline");
+                    return Err(CreateTimelineError::AncestorArchived);
+                }
+
                 if let Some(lsn) = ancestor_start_lsn.as_mut() {
                     *lsn = lsn.align();
 

From 7fdf1ab5b619f97b54c06303b659d980c5530e44 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 26 Sep 2024 15:17:01 +0100
Subject: [PATCH 09/29] CI: run compatibility tests on Postgres 17 (#9145)

## Problem

The latest storage release has generated artifacts for Postgres 17,
so we can enable compatibility tests this version

## Summary of changes
- Unskip `test_backward_compatibility` / `test_forward_compatibility` on
Postgres 17
---
 .github/workflows/build_and_test.yml      | 5 ++---
 test_runner/regress/test_compatibility.py | 8 +-------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 81a9fd99ae..ba5d139553 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1190,10 +1190,9 @@ jobs:
 
               files_to_promote+=("s3://${BUCKET}/${s3_key}")
 
-              # TODO Add v17
-              for pg_version in v14 v15 v16; do
+              for pg_version in v14 v15 v16 v17; do
                 # We run less tests for debug builds, so we don't need to promote them
-                if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
+                if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v17" ] ; }; then
                   continue
                 fi
 
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index fb5c1d3115..b559be5f18 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
 )
-from fixtures.pg_version import PgVersion, skip_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
 from fixtures.workload import Workload
 
@@ -156,9 +156,6 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
-@skip_on_postgres(
-    PgVersion.V17, "There are no snapshots yet"
-)  # TODO: revert this once we have snapshots
 def test_backward_compatibility(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
@@ -206,9 +203,6 @@ def test_backward_compatibility(
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
-@skip_on_postgres(
-    PgVersion.V17, "There are no snapshots yet"
-)  # TODO: revert this once we have snapshots
 def test_forward_compatibility(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,

From 80e974d05b7689f98ca0e83496895e821840e263 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Thu, 26 Sep 2024 15:42:17 +0100
Subject: [PATCH 10/29] fix(compute_ctl): race condition in configurator
 (#9162)

There was a tricky race condition in compute_ctl, that sometimes makes
configurator skip updates. It makes a deadlock because:
- control-plane cannot configure compute, because it's in
ConfigurationPending state
- compute_ctl doesn't do any reconfiguration because
`configurator_main_loop` missed notification for it

Full sequence that reproduces the issue:
1. `start_compute` finishes works and changes status
`self.set_status(ComputeStatus::Running);`
2. configurator received update about `Running` state and dropped the
mutex lock in the iteration
3. `/configure` request was triggered at the same time as step 1, and
got the mutex lock
4. same `/configure` request set the spec and updated the state to
`ConfigurationPending`, also sent a notification
5. next iteration in configurator got the mutex lock, but missed the
notification

There are more details in this slack thread:
https://neondb.slack.com/archives/C03438W3FLZ/p1727281028478689?thread_ts=1727261220.483799&cid=C03438W3FLZ

---------

Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
---
 compute_tools/src/configurator.rs | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs
index 274a221ac7..7bd0e4938d 100644
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -11,9 +11,17 @@ use crate::compute::ComputeNode;
 fn configurator_main_loop(compute: &Arc<ComputeNode>) {
     info!("waiting for reconfiguration requests");
     loop {
-        let state = compute.state.lock().unwrap();
-        let mut state = compute.state_changed.wait(state).unwrap();
+        let mut state = compute.state.lock().unwrap();
 
+        // We have to re-check the status after re-acquiring the lock because it could be that
+        // the status has changed while we were waiting for the lock, and we might not need to
+        // wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
+        // we are waiting for a condition variable that will never be signaled.
+        if state.status != ComputeStatus::ConfigurationPending {
+            state = compute.state_changed.wait(state).unwrap();
+        }
+
+        // Re-check the status after waking up
         if state.status == ComputeStatus::ConfigurationPending {
             info!("got configuration request");
             state.status = ComputeStatus::Configuration;

From 3d255d601b576961b1f564ed70011fdf64fbf0b2 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 26 Sep 2024 16:06:34 +0100
Subject: [PATCH 11/29] pageserver: rename control plane client & chunk
 validation requests (#8997)

## Problem

- In https://github.com/neondatabase/neon/pull/8784, the validate
controller API is modified to check generations directly in the
database. It batches tenants into separate queries to avoid generating a
huge statement, but
- While updating this, I realized that "control_plane_client" is a kind
of confusing name for the client code now that it primarily talks to the
storage controller (the case of talking to the control plane will go
away in a few months).

## Summary of changes

- Big rename to "ControllerUpcallClient" -- this reflects the storage
controller's api naming, where the paths used by the pageserver are in
`/upcall/`
- When sending validate requests, break them up into chunks so that we
avoid possible edge cases of generating any HTTP requests that require
database I/O across many thousands of tenants.

This PR mixes a functional change with a refactor, but the commits are
cleanly separated -- only the last commit is a functional change.

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/bin/pageserver.rs              |  4 +-
 ..._client.rs => controller_upcall_client.rs} | 64 +++++++++++--------
 pageserver/src/deletion_queue.rs              |  8 +--
 pageserver/src/deletion_queue/validator.rs    | 14 ++--
 pageserver/src/lib.rs                         |  2 +-
 pageserver/src/tenant/mgr.rs                  |  8 +--
 6 files changed, 56 insertions(+), 44 deletions(-)
 rename pageserver/src/{control_plane_client.rs => controller_upcall_client.rs} (80%)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index d15a0e47a4..e9e52acee6 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command};
 
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::config::PageserverIdentity;
-use pageserver::control_plane_client::ControlPlaneClient;
+use pageserver::controller_upcall_client::ControllerUpcallClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
@@ -396,7 +396,7 @@ fn start_pageserver(
     // Set up deletion queue
     let (deletion_queue, deletion_workers) = DeletionQueue::new(
         remote_storage.clone(),
-        ControlPlaneClient::new(conf, &shutdown_pageserver),
+        ControllerUpcallClient::new(conf, &shutdown_pageserver),
         conf,
     );
     if let Some(deletion_workers) = deletion_workers {
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/controller_upcall_client.rs
similarity index 80%
rename from pageserver/src/control_plane_client.rs
rename to pageserver/src/controller_upcall_client.rs
index d0a967b920..73fc6dc3ab 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -17,9 +17,12 @@ use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
 use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
 use pageserver_api::config::NodeMetadata;
 
-/// The Pageserver's client for using the control plane API: this is a small subset
-/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
-pub struct ControlPlaneClient {
+/// The Pageserver's client for using the storage controller upcall API: this is a small API
+/// for dealing with generations (see docs/rfcs/025-generation-numbers.md).
+///
+/// The server presenting this API may either be the storage controller or some other
+/// service (such as the Neon control plane) providing a store of generation numbers.
+pub struct ControllerUpcallClient {
     http_client: reqwest::Client,
     base_url: Url,
     node_id: NodeId,
@@ -45,7 +48,7 @@ pub trait ControlPlaneGenerationsApi {
     ) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
 }
 
-impl ControlPlaneClient {
+impl ControllerUpcallClient {
     /// A None return value indicates that the input `conf` object does not have control
     /// plane API enabled.
     pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
@@ -114,7 +117,7 @@ impl ControlPlaneClient {
     }
 }
 
-impl ControlPlaneGenerationsApi for ControlPlaneClient {
+impl ControlPlaneGenerationsApi for ControllerUpcallClient {
     /// Block until we get a successful response, or error out if we are shut down
     async fn re_attach(
         &self,
@@ -216,29 +219,38 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
             .join("validate")
             .expect("Failed to build validate path");
 
-        let request = ValidateRequest {
-            tenants: tenants
-                .into_iter()
-                .map(|(id, gen)| ValidateRequestTenant {
-                    id,
-                    gen: gen
-                        .into()
-                        .expect("Generation should always be valid for a Tenant doing deletions"),
-                })
-                .collect(),
-        };
+        // When sending validate requests, break them up into chunks so that we
+        // avoid possible edge cases of generating any HTTP requests that
+        // require database I/O across many thousands of tenants.
+        let mut result: HashMap<TenantShardId, bool> = HashMap::with_capacity(tenants.len());
+        for tenant_chunk in (tenants).chunks(128) {
+            let request = ValidateRequest {
+                tenants: tenant_chunk
+                    .iter()
+                    .map(|(id, generation)| ValidateRequestTenant {
+                        id: *id,
+                        gen: (*generation).into().expect(
+                            "Generation should always be valid for a Tenant doing deletions",
+                        ),
+                    })
+                    .collect(),
+            };
 
-        failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
-        if self.cancel.is_cancelled() {
-            return Err(RetryForeverError::ShuttingDown);
+            failpoint_support::sleep_millis_async!(
+                "control-plane-client-validate-sleep",
+                &self.cancel
+            );
+            if self.cancel.is_cancelled() {
+                return Err(RetryForeverError::ShuttingDown);
+            }
+
+            let response: ValidateResponse =
+                self.retry_http_forever(&re_attach_path, request).await?;
+            for rt in response.tenants {
+                result.insert(rt.id, rt.valid);
+            }
         }
 
-        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
-
-        Ok(response
-            .tenants
-            .into_iter()
-            .map(|rt| (rt.id, rt.valid))
-            .collect())
+        Ok(result.into_iter().collect())
     }
 }
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 22f7d5b824..73bdc90213 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -6,7 +6,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::Duration;
 
-use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::controller_upcall_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
@@ -622,7 +622,7 @@ impl DeletionQueue {
     /// If remote_storage is None, then the returned workers will also be None.
     pub fn new<C>(
         remote_storage: GenericRemoteStorage,
-        control_plane_client: Option<C>,
+        controller_upcall_client: Option<C>,
         conf: &'static PageServerConf,
     ) -> (Self, Option<DeletionQueueWorkers<C>>)
     where
@@ -662,7 +662,7 @@ impl DeletionQueue {
                     conf,
                     backend_rx,
                     executor_tx,
-                    control_plane_client,
+                    controller_upcall_client,
                     lsn_table.clone(),
                     cancel.clone(),
                 ),
@@ -704,7 +704,7 @@ mod test {
     use tokio::task::JoinHandle;
 
     use crate::{
-        control_plane_client::RetryForeverError,
+        controller_upcall_client::RetryForeverError,
         repository::Key,
         tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
     };
diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs
index d215fd2b7d..1d55581ebd 100644
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -25,8 +25,8 @@ use tracing::info;
 use tracing::warn;
 
 use crate::config::PageServerConf;
-use crate::control_plane_client::ControlPlaneGenerationsApi;
-use crate::control_plane_client::RetryForeverError;
+use crate::controller_upcall_client::ControlPlaneGenerationsApi;
+use crate::controller_upcall_client::RetryForeverError;
 use crate::metrics;
 use crate::virtual_file::MaybeFatalIo;
 
@@ -61,7 +61,7 @@ where
     tx: tokio::sync::mpsc::Sender<DeleterMessage>,
 
     // Client for calling into control plane API for validation of deletes
-    control_plane_client: Option<C>,
+    controller_upcall_client: Option<C>,
 
     // DeletionLists which are waiting generation validation.  Not safe to
     // execute until [`validate`] has processed them.
@@ -94,7 +94,7 @@ where
         conf: &'static PageServerConf,
         rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
         tx: tokio::sync::mpsc::Sender<DeleterMessage>,
-        control_plane_client: Option<C>,
+        controller_upcall_client: Option<C>,
         lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
         cancel: CancellationToken,
     ) -> Self {
@@ -102,7 +102,7 @@ where
             conf,
             rx,
             tx,
-            control_plane_client,
+            controller_upcall_client,
             lsn_table,
             pending_lists: Vec::new(),
             validated_lists: Vec::new(),
@@ -145,8 +145,8 @@ where
             return Ok(());
         }
 
-        let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
-            match control_plane_client
+        let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
+            match controller_upcall_client
                 .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
                 .await
             {
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 7a9cf495c7..08abfbd647 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -6,7 +6,7 @@ pub mod basebackup;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
-pub mod control_plane_client;
+pub mod controller_upcall_client;
 pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 1e7c1e10a5..e159ae186d 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -30,8 +30,8 @@ use utils::{backoff, completion, crashsafe};
 
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::control_plane_client::{
-    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
+use crate::controller_upcall_client::{
+    ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
@@ -122,7 +122,7 @@ pub(crate) enum ShardSelector {
     Known(ShardIndex),
 }
 
-/// A convenience for use with the re_attach ControlPlaneClient function: rather
+/// A convenience for use with the re_attach ControllerUpcallClient function: rather
 /// than the serializable struct, we build this enum that encapsulates
 /// the invariant that attached tenants always have generations.
 ///
@@ -341,7 +341,7 @@ async fn init_load_generations(
             "Emergency mode!  Tenants will be attached unsafely using their last known generation"
         );
         emergency_generations(tenant_confs)
-    } else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
+    } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) {
         info!("Calling control plane API to re-attach tenants");
         // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
         match client.re_attach(conf).await {

From 42e19e952fedefd9b591f7dc951e7d934990508c Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 26 Sep 2024 11:38:19 -0400
Subject: [PATCH 12/29] fix(pageserver): categorize client error in basebackup
 metrics (#9110)

We separated client error from basebackup error log lines in
https://github.com/neondatabase/neon/pull/7523, but we didn't do
anything for the metrics. In this patch, we fixed it.

ref https://github.com/neondatabase/neon/issues/8970

## Summary of changes

We use the same criteria as in `log_query_error` producing an info line
(instead of error) for the metrics. We added a `client_error` category
for the basebackup query time metrics.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/postgres_backend/src/lib.rs |  1 +
 pageserver/src/metrics.rs        | 19 ++++++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index e274d24585..085540e7b9 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -984,6 +984,7 @@ pub fn short_error(e: &QueryError) -> String {
 }
 
 fn log_query_error(query: &str, e: &QueryError) {
+    // If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`.
     match e {
         QueryError::Disconnected(ConnectionError::Io(io_error)) => {
             if is_expected_io_error(io_error) {
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 366bd82903..b76efa5b48 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -8,6 +8,8 @@ use metrics::{
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
+use postgres_backend::{is_expected_io_error, QueryError};
+use pq_proto::framed::ConnectionError;
 use strum::{EnumCount, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
 use tracing::warn;
@@ -1508,6 +1510,7 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
 pub(crate) struct BasebackupQueryTime {
     ok: Histogram,
     error: Histogram,
+    client_error: Histogram,
 }
 
 pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
@@ -1521,6 +1524,7 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
     BasebackupQueryTime {
         ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
         error: vec.get_metric_with_label_values(&["error"]).unwrap(),
+        client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(),
     }
 });
 
@@ -1557,7 +1561,7 @@ impl BasebackupQueryTime {
 }
 
 impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
-    pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
+    pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
         let elapsed = self.start.elapsed();
         let ex_throttled = self
             .ctx
@@ -1576,10 +1580,15 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
                 elapsed
             }
         };
-        let metric = if res.is_ok() {
-            &self.parent.ok
-        } else {
-            &self.parent.error
+        // If you want to change categorize of a specific error, also change it in `log_query_error`.
+        let metric = match res {
+            Ok(_) => &self.parent.ok,
+            Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
+                if is_expected_io_error(io_error) =>
+            {
+                &self.parent.client_error
+            }
+            Err(_) => &self.parent.error,
         };
         metric.observe(ex_throttled.as_secs_f64());
     }

From c4cdfe66ac93764a7d8e2f71f95626cd38ea2b43 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 19 Sep 2024 16:37:17 +0300
Subject: [PATCH 13/29] Fix flakiness of test_timeline_copy.

Timeline might be not initialized when timeline_start_lsn is
queried. Spotted by CI.
---
 test_runner/fixtures/safekeeper/http.py  | 11 +++++++++++
 test_runner/regress/test_wal_acceptor.py |  9 +++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 96c84d1616..7f170eeea3 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -8,6 +8,7 @@ import requests
 from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
+from fixtures.utils import wait_until
 
 
 # Walreceiver as returned by sk's timeline status endpoint.
@@ -161,6 +162,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
             walreceivers=walreceivers,
         )
 
+    # Get timeline_start_lsn, waiting until it's nonzero. It is a way to ensure
+    # that the timeline is fully initialized at the safekeeper.
+    def get_non_zero_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        def timeline_start_lsn_non_zero() -> Lsn:
+            s = self.timeline_status(tenant_id, timeline_id).timeline_start_lsn
+            assert s > Lsn(0)
+            return s
+
+        return wait_until(30, 1, timeline_start_lsn_non_zero)
+
     def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
         return self.timeline_status(tenant_id, timeline_id).commit_lsn
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 8ee548bdb0..adfe292c24 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2084,8 +2084,13 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int):
 
     endpoint.safe_psql("create table t(key int, value text)")
 
-    timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id)
-    timeline_start_lsn = timeline_status.timeline_start_lsn
+    # Note: currently timelines on sks are created by compute and commit of
+    # transaction above is finished when 2/3 sks received it, so there is a
+    # small chance that timeline on this sk is not created/initialized yet,
+    # hence the usage of waiting function to prevent flakiness.
+    timeline_start_lsn = (
+        env.safekeepers[0].http_client().get_non_zero_timeline_start_lsn(tenant_id, timeline_id)
+    )
     log.info(f"Timeline start LSN: {timeline_start_lsn}")
 
     current_percent = 0.0

From ec07a1ecc9d409f1a429763f8185ce79c597f760 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 26 Sep 2024 19:01:48 +0100
Subject: [PATCH 14/29] proxy: make local-proxy config by signal with PID,
 refine JWKS apis with role caching (#9164)

---
 Cargo.lock                      |   1 +
 libs/compute_api/src/spec.rs    |  16 +++
 proxy/Cargo.toml                |   1 +
 proxy/src/auth/backend/jwt.rs   | 166 +++++++++++++++++---------
 proxy/src/auth/backend/local.rs |  18 +--
 proxy/src/bin/local_proxy.rs    | 201 +++++++++++++++++++-------------
 proxy/src/bin/pg_sni_router.rs  |   4 +-
 proxy/src/bin/proxy.rs          |   5 +-
 proxy/src/console/messages.rs   |  12 +-
 proxy/src/intern.rs             |   4 +-
 proxy/src/lib.rs                |   9 +-
 proxy/src/scram/threadpool.rs   |   7 ++
 proxy/src/serverless/backend.rs |   2 +-
 13 files changed, 272 insertions(+), 174 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d0702e09d4..70a8af4e7d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4296,6 +4296,7 @@ dependencies = [
  "camino-tempfile",
  "chrono",
  "clap",
+ "compute_api",
  "consumption_metrics",
  "dashmap",
  "ecdsa 0.16.9",
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 883c624f71..525a1572ff 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -268,6 +268,22 @@ pub struct GenericOption {
 /// declare a `trait` on it.
 pub type GenericOptions = Option<Vec<GenericOption>>;
 
+/// Configured the local-proxy application with the relevant JWKS and roles it should
+/// use for authorizing connect requests using JWT.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct LocalProxySpec {
+    pub jwks: Vec<JwksSettings>,
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct JwksSettings {
+    pub id: String,
+    pub role_names: Vec<String>,
+    pub jwks_url: String,
+    pub provider_name: String,
+    pub jwt_audience: Option<String>,
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 501ce050e0..04e0f9d4f5 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -24,6 +24,7 @@ bytes = { workspace = true, features = ["serde"] }
 camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
+compute_api.workspace = true
 consumption_metrics.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 94e5999a5f..ab848551a9 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -12,7 +12,10 @@ use serde::{Deserialize, Deserializer};
 use signature::Verifier;
 use tokio::time::Instant;
 
-use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
+use crate::{
+    context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId,
+    RoleName,
+};
 
 // TODO(conrad): make these configurable.
 const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
@@ -27,7 +30,6 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
-        role_name: RoleName,
     ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }
 
@@ -35,10 +37,11 @@ pub(crate) struct AuthRule {
     pub(crate) id: String,
     pub(crate) jwks_url: url::Url,
     pub(crate) audience: Option<String>,
+    pub(crate) role_names: Vec<RoleNameInt>,
 }
 
 #[derive(Default)]
-pub(crate) struct JwkCache {
+pub struct JwkCache {
     client: reqwest::Client,
 
     map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
@@ -54,18 +57,28 @@ pub(crate) struct JwkCacheEntry {
 }
 
 impl JwkCacheEntry {
-    fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
-        self.key_sets.values().find_map(|key_set| {
-            key_set
-                .find_key(key_id)
-                .map(|jwk| (jwk, key_set.audience.as_deref()))
-        })
+    fn find_jwk_and_audience(
+        &self,
+        key_id: &str,
+        role_name: &RoleName,
+    ) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
+        self.key_sets
+            .values()
+            // make sure our requested role has access to the key set
+            .filter(|key_set| key_set.role_names.iter().any(|role| **role == **role_name))
+            // try and find the requested key-id in the key set
+            .find_map(|key_set| {
+                key_set
+                    .find_key(key_id)
+                    .map(|jwk| (jwk, key_set.audience.as_deref()))
+            })
     }
 }
 
 struct KeySet {
     jwks: jose_jwk::JwkSet,
     audience: Option<String>,
+    role_names: Vec<RoleNameInt>,
 }
 
 impl KeySet {
@@ -106,7 +119,6 @@ impl JwkCacheEntryLock {
         ctx: &RequestMonitoring,
         client: &reqwest::Client,
         endpoint: EndpointId,
-        role_name: RoleName,
         auth_rules: &F,
     ) -> anyhow::Result<Arc<JwkCacheEntry>> {
         // double check that no one beat us to updating the cache.
@@ -119,11 +131,10 @@ impl JwkCacheEntryLock {
             }
         }
 
-        let rules = auth_rules
-            .fetch_auth_rules(ctx, endpoint, role_name)
-            .await?;
+        let rules = auth_rules.fetch_auth_rules(ctx, endpoint).await?;
         let mut key_sets =
             ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
+
         // TODO(conrad): run concurrently
         // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
         for rule in rules {
@@ -151,6 +162,7 @@ impl JwkCacheEntryLock {
                                 KeySet {
                                     jwks,
                                     audience: rule.audience,
+                                    role_names: rule.role_names,
                                 },
                             );
                         }
@@ -173,7 +185,6 @@ impl JwkCacheEntryLock {
         ctx: &RequestMonitoring,
         client: &reqwest::Client,
         endpoint: EndpointId,
-        role_name: RoleName,
         fetch: &F,
     ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
         let now = Instant::now();
@@ -183,9 +194,7 @@ impl JwkCacheEntryLock {
         let Some(cached) = guard else {
             let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
             let permit = self.acquire_permit().await;
-            return self
-                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
-                .await;
+            return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
         };
 
         let last_update = now.duration_since(cached.last_retrieved);
@@ -196,9 +205,7 @@ impl JwkCacheEntryLock {
             let permit = self.acquire_permit().await;
 
             // it's been too long since we checked the keys. wait for them to update.
-            return self
-                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
-                .await;
+            return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
         }
 
         // every 5 minutes we should spawn a job to eagerly update the token.
@@ -212,7 +219,7 @@ impl JwkCacheEntryLock {
                 let ctx = ctx.clone();
                 tokio::spawn(async move {
                     if let Err(e) = entry
-                        .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch)
+                        .renew_jwks(permit, &ctx, &client, endpoint, &fetch)
                         .await
                     {
                         tracing::warn!(error=?e, "could not fetch JWKs in background job");
@@ -232,7 +239,7 @@ impl JwkCacheEntryLock {
         jwt: &str,
         client: &reqwest::Client,
         endpoint: EndpointId,
-        role_name: RoleName,
+        role_name: &RoleName,
         fetch: &F,
     ) -> Result<(), anyhow::Error> {
         // JWT compact form is defined to be
@@ -254,30 +261,26 @@ impl JwkCacheEntryLock {
         let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
             .context("Provided authentication token is not a valid JWT encoding")?;
 
-        ensure!(header.typ == "JWT");
+        ensure!(
+            header.typ == "JWT",
+            "Provided authentication token is not a valid JWT encoding"
+        );
         let kid = header.key_id.context("missing key id")?;
 
         let mut guard = self
-            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch)
+            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch)
             .await?;
 
         // get the key from the JWKs if possible. If not, wait for the keys to update.
         let (jwk, expected_audience) = loop {
-            match guard.find_jwk_and_audience(kid) {
+            match guard.find_jwk_and_audience(kid, role_name) {
                 Some(jwk) => break jwk,
                 None if guard.last_retrieved.elapsed() > MIN_RENEW => {
                     let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
 
                     let permit = self.acquire_permit().await;
                     guard = self
-                        .renew_jwks(
-                            permit,
-                            ctx,
-                            client,
-                            endpoint.clone(),
-                            role_name.clone(),
-                            fetch,
-                        )
+                        .renew_jwks(permit, ctx, client, endpoint.clone(), fetch)
                         .await?;
                 }
                 _ => {
@@ -320,11 +323,14 @@ impl JwkCacheEntryLock {
         let now = SystemTime::now();
 
         if let Some(exp) = payload.expiration {
-            ensure!(now < exp + CLOCK_SKEW_LEEWAY);
+            ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired");
         }
 
         if let Some(nbf) = payload.not_before {
-            ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
+            ensure!(
+                nbf < now + CLOCK_SKEW_LEEWAY,
+                "JWT token is not yet ready to use"
+            );
         }
 
         Ok(())
@@ -336,7 +342,7 @@ impl JwkCache {
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
-        role_name: RoleName,
+        role_name: &RoleName,
         fetch: &F,
         jwt: &str,
     ) -> Result<(), anyhow::Error> {
@@ -572,7 +578,7 @@ mod tests {
         format!("{header}.{body}")
     }
 
-    fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
+    fn new_ec_jwt(kid: String, key: &p256::SecretKey) -> String {
         use p256::ecdsa::{Signature, SigningKey};
 
         let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
@@ -660,11 +666,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         let (ec1, jwk3) = new_ec_jwk("3".into());
         let (ec2, jwk4) = new_ec_jwk("4".into());
 
-        let jwt1 = new_rsa_jwt("1".into(), rs1);
-        let jwt2 = new_rsa_jwt("2".into(), rs2);
-        let jwt3 = new_ec_jwt("3".into(), ec1);
-        let jwt4 = new_ec_jwt("4".into(), ec2);
-
         let foo_jwks = jose_jwk::JwkSet {
             keys: vec![jwk1, jwk3],
         };
@@ -706,47 +707,98 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         let client = reqwest::Client::new();
 
         #[derive(Clone)]
-        struct Fetch(SocketAddr);
+        struct Fetch(SocketAddr, Vec<RoleNameInt>);
 
         impl FetchAuthRules for Fetch {
             async fn fetch_auth_rules(
                 &self,
                 _ctx: &RequestMonitoring,
                 _endpoint: EndpointId,
-                _role_name: RoleName,
             ) -> anyhow::Result<Vec<AuthRule>> {
                 Ok(vec![
                     AuthRule {
                         id: "foo".to_owned(),
                         jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
                         audience: None,
+                        role_names: self.1.clone(),
                     },
                     AuthRule {
                         id: "bar".to_owned(),
                         jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
                         audience: None,
+                        role_names: self.1.clone(),
                     },
                 ])
             }
         }
 
-        let role_name = RoleName::from("user");
+        let role_name1 = RoleName::from("anonymous");
+        let role_name2 = RoleName::from("authenticated");
+
+        let fetch = Fetch(
+            addr,
+            vec![
+                RoleNameInt::from(&role_name1),
+                RoleNameInt::from(&role_name2),
+            ],
+        );
+
         let endpoint = EndpointId::from("ep");
 
         let jwk_cache = Arc::new(JwkCacheEntryLock::default());
 
-        for token in [jwt1, jwt2, jwt3, jwt4] {
-            jwk_cache
-                .check_jwt(
-                    &RequestMonitoring::test(),
-                    &token,
-                    &client,
-                    endpoint.clone(),
-                    role_name.clone(),
-                    &Fetch(addr),
-                )
-                .await
-                .unwrap();
+        let jwt1 = new_rsa_jwt("1".into(), rs1);
+        let jwt2 = new_rsa_jwt("2".into(), rs2);
+        let jwt3 = new_ec_jwt("3".into(), &ec1);
+        let jwt4 = new_ec_jwt("4".into(), &ec2);
+
+        // had the wrong kid, therefore will have the wrong ecdsa signature
+        let bad_jwt = new_ec_jwt("3".into(), &ec2);
+        // this role_name is not accepted
+        let bad_role_name = RoleName::from("cloud_admin");
+
+        let err = jwk_cache
+            .check_jwt(
+                &RequestMonitoring::test(),
+                &bad_jwt,
+                &client,
+                endpoint.clone(),
+                &role_name1,
+                &fetch,
+            )
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("signature error"));
+
+        let err = jwk_cache
+            .check_jwt(
+                &RequestMonitoring::test(),
+                &jwt1,
+                &client,
+                endpoint.clone(),
+                &bad_role_name,
+                &fetch,
+            )
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("jwk not found"));
+
+        let tokens = [jwt1, jwt2, jwt3, jwt4];
+        let role_names = [role_name1, role_name2];
+        for role in &role_names {
+            for token in &tokens {
+                jwk_cache
+                    .check_jwt(
+                        &RequestMonitoring::test(),
+                        token,
+                        &client,
+                        endpoint.clone(),
+                        role,
+                        &fetch,
+                    )
+                    .await
+                    .unwrap();
+            }
         }
     }
 }
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index 2ff2ca00f0..2ab53f2c6a 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, net::SocketAddr};
+use std::net::SocketAddr;
 
 use anyhow::Context;
 use arc_swap::ArcSwapOption;
@@ -10,8 +10,8 @@ use crate::{
         NodeInfo,
     },
     context::RequestMonitoring,
-    intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
-    EndpointId, RoleName,
+    intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag},
+    EndpointId,
 };
 
 use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
@@ -48,26 +48,17 @@ impl LocalBackend {
 #[derive(Clone, Copy)]
 pub(crate) struct StaticAuthRules;
 
-pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
-
-#[derive(Debug, Clone)]
-pub struct JwksRoleSettings {
-    pub roles: HashMap<RoleName, EndpointJwksResponse>,
-    pub project_id: ProjectIdInt,
-    pub branch_id: BranchIdInt,
-}
+pub static JWKS_ROLE_MAP: ArcSwapOption<EndpointJwksResponse> = ArcSwapOption::const_empty();
 
 impl FetchAuthRules for StaticAuthRules {
     async fn fetch_auth_rules(
         &self,
         _ctx: &RequestMonitoring,
         _endpoint: EndpointId,
-        role_name: RoleName,
     ) -> anyhow::Result<Vec<AuthRule>> {
         let mappings = JWKS_ROLE_MAP.load();
         let role_mappings = mappings
             .as_deref()
-            .and_then(|m| m.roles.get(&role_name))
             .context("JWKs settings for this role were not configured")?;
         let mut rules = vec![];
         for setting in &role_mappings.jwks {
@@ -75,6 +66,7 @@ impl FetchAuthRules for StaticAuthRules {
                 id: setting.id.clone(),
                 jwks_url: setting.jwks_url.clone(),
                 audience: setting.jwt_audience.clone(),
+                role_names: setting.role_names.clone(),
             });
         }
 
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 94365ddf05..1b3f465686 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -1,34 +1,35 @@
-use std::{
-    net::SocketAddr,
-    path::{Path, PathBuf},
-    pin::pin,
-    sync::Arc,
-    time::Duration,
-};
+use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration};
 
-use anyhow::{bail, ensure};
+use anyhow::{bail, ensure, Context};
+use camino::{Utf8Path, Utf8PathBuf};
+use compute_api::spec::LocalProxySpec;
 use dashmap::DashMap;
-use futures::{future::Either, FutureExt};
+use futures::future::Either;
 use proxy::{
-    auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
+    auth::backend::local::{LocalBackend, JWKS_ROLE_MAP},
     cancellation::CancellationHandlerMain,
     config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
-    console::{locks::ApiLocks, messages::JwksRoleMapping},
+    console::{
+        locks::ApiLocks,
+        messages::{EndpointJwksResponse, JwksSettings},
+    },
     http::health_server::AppMetrics,
+    intern::RoleNameInt,
     metrics::{Metrics, ThreadPoolMetrics},
     rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
     scram::threadpool::ThreadPool,
     serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
+    RoleName,
 };
 
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
 
 use clap::Parser;
-use tokio::{net::TcpListener, task::JoinSet};
+use tokio::{net::TcpListener, sync::Notify, task::JoinSet};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
-use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
+use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry};
 
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
@@ -72,9 +73,12 @@ struct LocalProxyCliArgs {
     /// Address of the postgres server
     #[clap(long, default_value = "127.0.0.1:5432")]
     compute: SocketAddr,
-    /// File address of the local proxy config file
+    /// Path of the local proxy config file
     #[clap(long, default_value = "./localproxy.json")]
-    config_path: PathBuf,
+    config_path: Utf8PathBuf,
+    /// Path of the local proxy PID file
+    #[clap(long, default_value = "./localproxy.pid")]
+    pid_path: Utf8PathBuf,
 }
 
 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -126,6 +130,24 @@ async fn main() -> anyhow::Result<()> {
     let args = LocalProxyCliArgs::parse();
     let config = build_config(&args)?;
 
+    // before we bind to any ports, write the process ID to a file
+    // so that compute-ctl can find our process later
+    // in order to trigger the appropriate SIGHUP on config change.
+    //
+    // This also claims a "lock" that makes sure only one instance
+    // of local-proxy runs at a time.
+    let _process_guard = loop {
+        match pid_file::claim_for_current_process(&args.pid_path) {
+            Ok(guard) => break guard,
+            Err(e) => {
+                // compute-ctl might have tried to read the pid-file to let us
+                // know about some config change. We should try again.
+                error!(path=?args.pid_path, "could not claim PID file guard: {e:?}");
+                tokio::time::sleep(Duration::from_secs(1)).await;
+            }
+        }
+    };
+
     let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
     let http_listener = TcpListener::bind(args.http).await?;
     let shutdown = CancellationToken::new();
@@ -139,12 +161,30 @@ async fn main() -> anyhow::Result<()> {
         16,
     ));
 
-    refresh_config(args.config_path.clone()).await;
+    // write the process ID to a file so that compute-ctl can find our process later
+    // in order to trigger the appropriate SIGHUP on config change.
+    let pid = std::process::id();
+    info!("process running in PID {pid}");
+    std::fs::write(args.pid_path, format!("{pid}\n")).context("writing PID to file")?;
 
     let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
-        refresh_config(args.config_path.clone()).map(Ok)
+
+    let refresh_config_notify = Arc::new(Notify::new());
+    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), {
+        let refresh_config_notify = Arc::clone(&refresh_config_notify);
+        move || {
+            refresh_config_notify.notify_one();
+        }
     }));
+
+    // trigger the first config load **after** setting up the signal hook
+    // to avoid the race condition where:
+    // 1. No config file registered when local-proxy starts up
+    // 2. The config file is written but the signal hook is not yet received
+    // 3. local-proxy completes startup but has no config loaded, despite there being a registerd config.
+    refresh_config_notify.notify_one();
+    tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify));
+
     maintenance_tasks.spawn(proxy::http::health_server::task_main(
         metrics_listener,
         AppMetrics {
@@ -245,81 +285,84 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
     })))
 }
 
-async fn refresh_config(path: PathBuf) {
-    match refresh_config_inner(&path).await {
-        Ok(()) => {}
-        Err(e) => {
-            error!(error=?e, ?path, "could not read config file");
+async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc<Notify>) {
+    loop {
+        rx.notified().await;
+
+        match refresh_config_inner(&path).await {
+            Ok(()) => {}
+            Err(e) => {
+                error!(error=?e, ?path, "could not read config file");
+            }
         }
     }
 }
 
-async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
+async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> {
     let bytes = tokio::fs::read(&path).await?;
-    let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;
+    let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
 
-    let mut settings = None;
+    let mut jwks_set = vec![];
 
-    for mapping in data.roles.values_mut() {
-        for jwks in &mut mapping.jwks {
-            ensure!(
-                jwks.jwks_url.has_authority()
-                    && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
-                "Invalid JWKS url. Must be HTTP",
-            );
+    for jwks in data.jwks {
+        let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
 
-            ensure!(
-                jwks.jwks_url
-                    .host()
-                    .is_some_and(|h| h != url::Host::Domain("")),
-                "Invalid JWKS url. No domain listed",
-            );
+        ensure!(
+            jwks_url.has_authority()
+                && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
+            "Invalid JWKS url. Must be HTTP",
+        );
 
-            // clear username, password and ports
-            jwks.jwks_url.set_username("").expect(
+        ensure!(
+            jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
+            "Invalid JWKS url. No domain listed",
+        );
+
+        // clear username, password and ports
+        jwks_url
+            .set_username("")
+            .expect("url can be a base and has a valid host and is not a file. should not error");
+        jwks_url
+            .set_password(None)
+            .expect("url can be a base and has a valid host and is not a file. should not error");
+        // local testing is hard if we need to have a specific restricted port
+        if cfg!(not(feature = "testing")) {
+            jwks_url.set_port(None).expect(
                 "url can be a base and has a valid host and is not a file. should not error",
             );
-            jwks.jwks_url.set_password(None).expect(
-                "url can be a base and has a valid host and is not a file. should not error",
-            );
-            // local testing is hard if we need to have a specific restricted port
-            if cfg!(not(feature = "testing")) {
-                jwks.jwks_url.set_port(None).expect(
-                    "url can be a base and has a valid host and is not a file. should not error",
-                );
-            }
-
-            // clear query params
-            jwks.jwks_url.set_fragment(None);
-            jwks.jwks_url.query_pairs_mut().clear().finish();
-
-            if jwks.jwks_url.scheme() != "https" {
-                // local testing is hard if we need to set up https support.
-                if cfg!(not(feature = "testing")) {
-                    jwks.jwks_url
-                        .set_scheme("https")
-                        .expect("should not error to set the scheme to https if it was http");
-                } else {
-                    warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
-                }
-            }
-
-            let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
-            ensure!(
-                *pr == jwks.project_id,
-                "inconsistent project IDs configured"
-            );
-            ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
         }
+
+        // clear query params
+        jwks_url.set_fragment(None);
+        jwks_url.query_pairs_mut().clear().finish();
+
+        if jwks_url.scheme() != "https" {
+            // local testing is hard if we need to set up https support.
+            if cfg!(not(feature = "testing")) {
+                jwks_url
+                    .set_scheme("https")
+                    .expect("should not error to set the scheme to https if it was http");
+            } else {
+                warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
+            }
+        }
+
+        jwks_set.push(JwksSettings {
+            id: jwks.id,
+            jwks_url,
+            provider_name: jwks.provider_name,
+            jwt_audience: jwks.jwt_audience,
+            role_names: jwks
+                .role_names
+                .into_iter()
+                .map(RoleName::from)
+                .map(|s| RoleNameInt::from(&s))
+                .collect(),
+        })
     }
 
-    if let Some((project_id, branch_id)) = settings {
-        JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
-            roles: data.roles,
-            project_id,
-            branch_id,
-        })));
-    }
+    info!("successfully loaded new config");
+    JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
 
     Ok(())
 }
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 20d2d3df9a..53f1586abe 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -133,9 +133,7 @@ async fn main() -> anyhow::Result<()> {
         proxy_listener,
         cancellation_token.clone(),
     ));
-    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
-        Ok(())
-    }));
+    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {}));
 
     // the signal task cant ever succeed.
     // the main task can error, or can succeed on cancellation.
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 2ac66ffe8c..141005788d 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -461,10 +461,7 @@ async fn main() -> anyhow::Result<()> {
 
     // maintenance tasks. these never return unless there's an error
     let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(
-        cancellation_token.clone(),
-        || async { Ok(()) },
-    ));
+    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {}));
     maintenance_tasks.spawn(http::health_server::task_main(
         http_listener,
         AppMetrics {
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 85683acb82..1696e229ce 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,13 +1,11 @@
 use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
 use std::fmt::{self, Display};
 
 use crate::auth::IpPattern;
 
-use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
+use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
 use crate::proxy::retry::CouldRetry;
-use crate::RoleName;
 
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
@@ -348,11 +346,6 @@ impl ColdStartInfo {
     }
 }
 
-#[derive(Debug, Deserialize, Clone)]
-pub struct JwksRoleMapping {
-    pub roles: HashMap<RoleName, EndpointJwksResponse>,
-}
-
 #[derive(Debug, Deserialize, Clone)]
 pub struct EndpointJwksResponse {
     pub jwks: Vec<JwksSettings>,
@@ -361,11 +354,10 @@ pub struct EndpointJwksResponse {
 #[derive(Debug, Deserialize, Clone)]
 pub struct JwksSettings {
     pub id: String,
-    pub project_id: ProjectIdInt,
-    pub branch_id: BranchIdInt,
     pub jwks_url: url::Url,
     pub provider_name: String,
     pub jwt_audience: Option<String>,
+    pub role_names: Vec<RoleNameInt>,
 }
 
 #[cfg(test)]
diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs
index e5144cfe2e..108420d7d7 100644
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -130,14 +130,14 @@ impl<Id: InternId> Default for StringInterner<Id> {
 }
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub(crate) struct RoleNameTag;
+pub struct RoleNameTag;
 impl InternId for RoleNameTag {
     fn get_interner() -> &'static StringInterner<Self> {
         static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
         ROLE_NAMES.get_or_init(Default::default)
     }
 }
-pub(crate) type RoleNameInt = InternedString<RoleNameTag>;
+pub type RoleNameInt = InternedString<RoleNameTag>;
 impl From<&RoleName> for RoleNameInt {
     fn from(value: &RoleName) -> Self {
         RoleNameTag::get_interner().get_or_intern(value)
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 0070839aa8..ea0a9beced 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -82,7 +82,7 @@
     impl_trait_overcaptures,
 )]
 
-use std::{convert::Infallible, future::Future};
+use std::convert::Infallible;
 
 use anyhow::{bail, Context};
 use intern::{EndpointIdInt, EndpointIdTag, InternId};
@@ -117,13 +117,12 @@ pub mod usage_metrics;
 pub mod waiters;
 
 /// Handle unix signals appropriately.
-pub async fn handle_signals<F, Fut>(
+pub async fn handle_signals<F>(
     token: CancellationToken,
     mut refresh_config: F,
 ) -> anyhow::Result<Infallible>
 where
-    F: FnMut() -> Fut,
-    Fut: Future<Output = anyhow::Result<()>>,
+    F: FnMut(),
 {
     use tokio::signal::unix::{signal, SignalKind};
 
@@ -136,7 +135,7 @@ where
             // Hangup is commonly used for config reload.
             _ = hangup.recv() => {
                 warn!("received SIGHUP");
-                refresh_config().await?;
+                refresh_config();
             }
             // Shut down the whole application.
             _ = interrupt.recv() => {
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
index 2702aeebfe..c027a0cd20 100644
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -43,6 +43,13 @@ impl ThreadPool {
     pub fn new(n_workers: u8) -> Arc<Self> {
         // rayon would be nice here, but yielding in rayon does not work well afaict.
 
+        if n_workers == 0 {
+            return Arc::new(Self {
+                runtime: None,
+                metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
+            });
+        }
+
         Arc::new_cyclic(|pool| {
             let pool = pool.clone();
             let worker_id = AtomicUsize::new(0);
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index aa236907db..607eb0caf6 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -119,7 +119,7 @@ impl PoolingBackend {
                     .check_jwt(
                         ctx,
                         user_info.endpoint.clone(),
-                        user_info.user.clone(),
+                        &user_info.user,
                         &StaticAuthRules,
                         jwt,
                     )

From 40f7930a7da65325280dfc69f5940045b2147bc7 Mon Sep 17 00:00:00 2001
From: Arseny Sher <ars@neon.tech>
Date: Fri, 27 Sep 2024 09:59:38 +0300
Subject: [PATCH 15/29] safekeeper: skip syncfs on start if --no-sync is
 specified. (#9166)

https://neondb.slack.com/archives/C059ZC138NR/p1727350911890989?thread_ts=1727350211.370869&cid=C059ZC138NR
---
 safekeeper/src/bin/safekeeper.rs | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 5270934f5e..1e5f963a4f 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -374,14 +374,16 @@ type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
 
 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     // fsync the datadir to make sure we have a consistent state on disk.
-    let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
-    let started = Instant::now();
-    utils::crashsafe::syncfs(dfd)?;
-    let elapsed = started.elapsed();
-    info!(
-        elapsed_ms = elapsed.as_millis(),
-        "syncfs data directory done"
-    );
+    if !conf.no_sync {
+        let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
+        let started = Instant::now();
+        utils::crashsafe::syncfs(dfd)?;
+        let elapsed = started.elapsed();
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "syncfs data directory done"
+        );
+    }
 
     info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
     let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {

From fa354a65ab51b424bec52ea604894f13682b05e5 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 27 Sep 2024 12:36:43 +0100
Subject: [PATCH 16/29] libs: improve logging on PG connection errors (#9130)

## Problem
We get some unexpected errors, but don't know who they're happening for.

## Summary of change
Add tenant id and peer address to PG connection error logs.

Related https://github.com/neondatabase/cloud/issues/17336
---
 pageserver/src/page_service.rs | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 9261b7481d..98d718dde4 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -273,10 +273,20 @@ async fn page_service_conn_main(
                 info!("Postgres client disconnected ({io_error})");
                 Ok(())
             } else {
-                Err(io_error).context("Postgres connection error")
+                let tenant_id = conn_handler.timeline_handles.tenant_id();
+                Err(io_error).context(format!(
+                    "Postgres connection error for tenant_id={:?} client at peer_addr={}",
+                    tenant_id, peer_addr
+                ))
             }
         }
-        other => other.context("Postgres query error"),
+        other => {
+            let tenant_id = conn_handler.timeline_handles.tenant_id();
+            other.context(format!(
+                "Postgres query error for tenant_id={:?} client peer_addr={}",
+                tenant_id, peer_addr
+            ))
+        }
     }
 }
 
@@ -340,6 +350,10 @@ impl TimelineHandles {
                 }
             })
     }
+
+    fn tenant_id(&self) -> Option<TenantId> {
+        self.wrapper.tenant_id.get().copied()
+    }
 }
 
 pub(crate) struct TenantManagerWrapper {

From 02cdd37b567551fd944e452738b1d9c76975bd68 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 27 Sep 2024 10:29:32 +0300
Subject: [PATCH 17/29] Dump backtrace if a core dump is called just "core"
 (#9125)

I hope this lets us capture backtraces in CI. At least it makes it
work on my laptop, which is valuable even if we need to do more for
CI.

See issue #2800.
---
 compute_tools/src/compute.rs | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 1f47bb58a3..9ed2fc5143 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1123,6 +1123,9 @@ impl ComputeNode {
     //
     // Use that as a default location and pattern, except macos where core dumps are written
     // to /cores/ directory by default.
+    //
+    // With default Linux settings, the core dump file is called just "core", so check for
+    // that too.
     pub fn check_for_core_dumps(&self) -> Result<()> {
         let core_dump_dir = match std::env::consts::OS {
             "macos" => Path::new("/cores/"),
@@ -1134,8 +1137,17 @@ impl ComputeNode {
         let files = fs::read_dir(core_dump_dir)?;
         let cores = files.filter_map(|entry| {
             let entry = entry.ok()?;
-            let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
-            Some(entry.path())
+
+            let is_core_dump = match entry.file_name().to_str()? {
+                n if n.starts_with("core.") => true,
+                "core" => true,
+                _ => false,
+            };
+            if is_core_dump {
+                Some(entry.path())
+            } else {
+                None
+            }
         });
 
         // Print backtrace for each core dump

From 357fa070a3313f833f2924bb7c096845bd52e37a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 27 Sep 2024 10:29:35 +0300
Subject: [PATCH 18/29] Add gdb to build-tools (#9125)

So that compute_ctl can use it to print backtrace on core dumps

See issue #2800.
---
 Dockerfile.build-tools | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index c4209c7a12..d8bcacf228 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -13,6 +13,9 @@ RUN useradd -ms /bin/bash nonroot -b /home
 SHELL ["/bin/bash", "-c"]
 
 # System deps
+#
+# 'gdb' is included so that we get backtraces of core dumps produced in
+# regression tests
 RUN set -e \
     && apt update \
     && apt install -y \
@@ -24,6 +27,7 @@ RUN set -e \
         cmake \
         curl \
         flex \
+        gdb \
         git \
         gnupg \
         gzip \

From fc962c9605c44246af218153240a32057acd5181 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 26 Sep 2024 14:44:14 -0500
Subject: [PATCH 19/29] Use long options when calling initdb

Verbosity in this case is good when reading the code. Short options are
better when operating in an interactive shell.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/compute.rs             | 2 +-
 control_plane/src/storage_controller.rs  | 2 +-
 libs/postgres_ffi/wal_craft/src/lib.rs   | 4 ++--
 pageserver/src/tenant.rs                 | 6 +++---
 test_runner/fixtures/neon_fixtures.py    | 2 +-
 test_runner/regress/test_wal_acceptor.py | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 9ed2fc5143..30310153f9 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -710,7 +710,7 @@ impl ComputeNode {
         info!("running initdb");
         let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
         Command::new(initdb_bin)
-            .args(["-D", pgdata])
+            .args(["--pgdata", pgdata])
             .output()
             .expect("cannot start initdb process");
 
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 0c0e67dff0..36e5e04c86 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -347,7 +347,7 @@ impl StorageController {
 
             if !tokio::fs::try_exists(&pg_data_path).await? {
                 let initdb_args = [
-                    "-D",
+                    "--pgdata",
                     pg_data_path.as_ref(),
                     "--username",
                     &username(),
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 5c0abda522..9524a5149b 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -93,9 +93,9 @@ impl Conf {
         );
         let output = self
             .new_pg_command("initdb")?
-            .arg("-D")
+            .arg("--pgdata")
             .arg(&self.datadir)
-            .args(["-U", "postgres", "--no-instructions", "--no-sync"])
+            .args(["--username", "postgres", "--no-instructions", "--no-sync"])
             .output()?;
         debug!("initdb output: {:?}", output);
         ensure!(
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2aebf4f999..e88cda0c9f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3887,9 +3887,9 @@ async fn run_initdb(
     let _permit = INIT_DB_SEMAPHORE.acquire().await;
 
     let initdb_command = tokio::process::Command::new(&initdb_bin_path)
-        .args(["-D", initdb_target_dir.as_ref()])
-        .args(["-U", &conf.superuser])
-        .args(["-E", "utf8"])
+        .args(["--pgdata", initdb_target_dir.as_ref()])
+        .args(["--username", &conf.superuser])
+        .args(["--encoding", "utf8"])
         .arg("--no-instructions")
         .arg("--no-sync")
         .env_clear()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 70fe632f49..70a038c960 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3311,7 +3311,7 @@ class VanillaPostgres(PgProtocol):
         self.pg_bin = pg_bin
         self.running = False
         if init:
-            self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
+            self.pg_bin.run_capture(["initdb", "--pgdata", str(pgdatadir)])
         self.configure([f"port = {port}\n"])
 
     def enable_tls(self):
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index adfe292c24..c75235a04b 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -772,7 +772,7 @@ class ProposerPostgres(PgProtocol):
     def initdb(self):
         """Run initdb"""
 
-        args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()]
+        args = ["initdb", "--username", "cloud_admin", "--pgdata", self.pg_data_dir_path()]
         self.pg_bin.run(args)
 
     def start(self):

From 42ef08db470e24853fe4c77acb5fd84a1bcb2356 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Fri, 27 Sep 2024 09:56:52 -0400
Subject: [PATCH 20/29] fix(pageserver): LSN lease edge cases around
 restarts/migrations (#9055)

Part of #7497, closes #8817.

## Problem

See #8817.

## Summary of changes

**compute_ctl**

- Renew lsn lease as soon as `/configure` updates pageserver_connstr,
use `state_changed` Condvar for synchronization.

**pageserver**

As mentioned in
https://github.com/neondatabase/neon/issues/8817#issuecomment-2315768076,
we still want some permanent error reported if a lease cannot be
granted. By considering attachment mode and the added
`lsn_lease_deadline` when processing lease requests, we can also bound
the case of bad requests to a very short period after migration/restart.

- Refactor https://github.com/neondatabase/neon/pull/9024 and move
`lsn_lease_deadline` to `AttachedTenantConf` so timeline can easily
access it.
- Have separate HTTP `init_lsn_lease` and  libpq `renew_lsn_lease` API.
  - Always do LSN verification for the initial HTTP lease request.
- LSN verification for the renewal is **still done** when tenants are
not in `AttachedSingle` and we have pass the `lsn_lease_deadline`, which
give plenty of time for compute to renew the lease.

**neon_local**

- add and call `timeline_init_lsn_lease` mgmt_api at static endpoint
start. The initial lsn lease http request is sent when we run `cargo
neon endpoint start <static endpoint>`.


## Testing

- Extend `test_readonly_node_gc` to do pageserver restarts and
migration.

## Future Work

- The control plane should make the initial lease request through HTTP
when creating a static endpoint. This is currently only done in
`neon_local`.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 Cargo.lock                                    |   1 +
 compute_tools/src/compute.rs                  |  31 +++++
 compute_tools/src/lsn_lease.rs                |  13 +-
 control_plane/Cargo.toml                      |   1 +
 control_plane/src/bin/neon_local.rs           |  26 ++--
 pageserver/client/src/mgmt_api.rs             |  18 +++
 pageserver/src/http/routes.rs                 |  17 ++-
 pageserver/src/page_service.rs                |   2 +-
 pageserver/src/tenant.rs                      | 114 +++++++++++++-----
 pageserver/src/tenant/gc_block.rs             |  73 +++--------
 pageserver/src/tenant/mgr.rs                  |   6 -
 pageserver/src/tenant/tasks.rs                |   1 -
 pageserver/src/tenant/timeline.rs             |  69 ++++++++---
 test_runner/regress/test_readonly_node.py     |  51 +++++---
 .../regress/test_timeline_gc_blocking.py      |   5 +-
 15 files changed, 282 insertions(+), 146 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 70a8af4e7d..2bd828367c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1321,6 +1321,7 @@ dependencies = [
  "clap",
  "comfy-table",
  "compute_api",
+ "futures",
  "humantime",
  "humantime-serde",
  "hyper 0.14.30",
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 30310153f9..a6b7633eda 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -10,6 +10,7 @@ use std::sync::atomic::AtomicU32;
 use std::sync::atomic::Ordering;
 use std::sync::{Condvar, Mutex, RwLock};
 use std::thread;
+use std::time::Duration;
 use std::time::Instant;
 
 use anyhow::{Context, Result};
@@ -1398,6 +1399,36 @@ LIMIT 100",
         }
         Ok(remote_ext_metrics)
     }
+
+    /// Waits until current thread receives a state changed notification and
+    /// the pageserver connection strings has changed.
+    ///
+    /// The operation will time out after a specified duration.
+    pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
+        let state = self.state.lock().unwrap();
+        let old_pageserver_connstr = state
+            .pspec
+            .as_ref()
+            .expect("spec must be set")
+            .pageserver_connstr
+            .clone();
+        let mut unchanged = true;
+        let _ = self
+            .state_changed
+            .wait_timeout_while(state, duration, |s| {
+                let pageserver_connstr = &s
+                    .pspec
+                    .as_ref()
+                    .expect("spec must be set")
+                    .pageserver_connstr;
+                unchanged = pageserver_connstr == &old_pageserver_connstr;
+                unchanged
+            })
+            .unwrap();
+        if !unchanged {
+            info!("Pageserver config changed");
+        }
+    }
 }
 
 pub fn forward_termination_signal() {
diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs
index 7e5917c55f..3061d387a5 100644
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -57,10 +57,10 @@ fn lsn_lease_bg_task(
             .max(valid_duration / 2);
 
         info!(
-            "Succeeded, sleeping for {} seconds",
+            "Request succeeded, sleeping for {} seconds",
             sleep_duration.as_secs()
         );
-        thread::sleep(sleep_duration);
+        compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration);
     }
 }
 
@@ -89,10 +89,7 @@ fn acquire_lsn_lease_with_retry(
                 .map(|connstr| {
                     let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
                     if let Some(storage_auth_token) = &spec.storage_auth_token {
-                        info!("Got storage auth token from spec file");
                         config.password(storage_auth_token.clone());
-                    } else {
-                        info!("Storage auth token not set");
                     }
                     config
                 })
@@ -108,9 +105,11 @@ fn acquire_lsn_lease_with_retry(
                 bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
             }
             Err(e) => {
-                warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
+                warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");
 
-                thread::sleep(Duration::from_millis(retry_period_ms as u64));
+                compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis(
+                    retry_period_ms as u64,
+                ));
                 retry_period_ms *= 1.5;
                 retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
             }
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index df87c181bf..355eca0fe5 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,6 +9,7 @@ anyhow.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 92f609761a..b6532ff2ac 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -894,17 +894,27 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 // to pass these on to postgres.
                 let storage_controller = StorageController::from_env(env);
                 let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
-                let pageservers = locate_result
-                    .shards
-                    .into_iter()
-                    .map(|shard| {
-                        (
+                let pageservers = futures::future::try_join_all(
+                    locate_result.shards.into_iter().map(|shard| async move {
+                        if let ComputeMode::Static(lsn) = endpoint.mode {
+                            // Initialize LSN leases for static computes.
+                            let conf = env.get_pageserver_conf(shard.node_id).unwrap();
+                            let pageserver = PageServerNode::from_env(env, conf);
+
+                            pageserver
+                                .http_client
+                                .timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn)
+                                .await?;
+                        }
+
+                        anyhow::Ok((
                             Host::parse(&shard.listen_pg_addr)
                                 .expect("Storage controller reported bad hostname"),
                             shard.listen_pg_port,
-                        )
-                    })
-                    .collect::<Vec<_>>();
+                        ))
+                    }),
+                )
+                .await?;
                 let stripe_size = locate_result.shard_params.stripe_size;
 
                 (pageservers, stripe_size)
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 2d95ac42e6..592f1ded0d 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -736,4 +736,22 @@ impl Client {
             .await
             .map_err(Error::ReceiveBody)
     }
+
+    pub async fn timeline_init_lsn_lease(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<LsnLease> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease",
+            self.mgmt_api_endpoint,
+        );
+
+        self.request(Method::POST, &uri, LsnLeaseRequest { lsn })
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ba38120bf1..6f0402e7b0 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -824,7 +824,7 @@ async fn get_lsn_by_timestamp_handler(
 
     let lease = if with_lease {
         timeline
-            .make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
+            .init_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
             .inspect_err(|_| {
                 warn!("fail to grant a lease to {}", lsn);
             })
@@ -1692,9 +1692,18 @@ async fn lsn_lease_handler(
     let timeline =
         active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
             .await?;
-    let result = timeline
-        .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
-        .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
+
+    let result = async {
+        timeline
+            .init_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
+            .map_err(|e| {
+                ApiError::InternalServerError(
+                    e.context(format!("invalid lsn lease request at {lsn}")),
+                )
+            })
+    }
+    .instrument(info_span!("init_lsn_lease", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .await?;
 
     json_response(StatusCode::OK, result)
 }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 98d718dde4..8fa6b9a7f0 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -833,7 +833,7 @@ impl PageServerHandler {
         set_tracing_field_shard_id(&timeline);
 
         let lease = timeline
-            .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
+            .renew_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
             .inspect_err(|e| {
                 warn!("{e}");
             })
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e88cda0c9f..db88303f7b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -21,6 +21,7 @@ use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use pageserver_api::models;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::LsnLease;
 use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::models::TimelineState;
 use pageserver_api::models::TopTenantShardItem;
@@ -182,27 +183,54 @@ pub struct TenantSharedResources {
 pub(super) struct AttachedTenantConf {
     tenant_conf: TenantConfOpt,
     location: AttachedLocationConfig,
+    /// The deadline before which we are blocked from GC so that
+    /// leases have a chance to be renewed.
+    lsn_lease_deadline: Option<tokio::time::Instant>,
 }
 
 impl AttachedTenantConf {
     fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
+        // Sets a deadline before which we cannot proceed to GC due to lsn lease.
+        //
+        // We do this as the leases mapping are not persisted to disk. By delaying GC by lease
+        // length, we guarantee that all the leases we granted before will have a chance to renew
+        // when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
+        let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
+            Some(
+                tokio::time::Instant::now()
+                    + tenant_conf
+                        .lsn_lease_length
+                        .unwrap_or(LsnLease::DEFAULT_LENGTH),
+            )
+        } else {
+            // We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
+            // because we don't do GC in these modes.
+            None
+        };
+
         Self {
             tenant_conf,
             location,
+            lsn_lease_deadline,
         }
     }
 
     fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
         match &location_conf.mode {
-            LocationMode::Attached(attach_conf) => Ok(Self {
-                tenant_conf: location_conf.tenant_conf,
-                location: *attach_conf,
-            }),
+            LocationMode::Attached(attach_conf) => {
+                Ok(Self::new(location_conf.tenant_conf, *attach_conf))
+            }
             LocationMode::Secondary(_) => {
                 anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
             }
         }
     }
+
+    fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
+        self.lsn_lease_deadline
+            .map(|d| tokio::time::Instant::now() < d)
+            .unwrap_or(false)
+    }
 }
 struct TimelinePreload {
     timeline_id: TimelineId,
@@ -1822,6 +1850,11 @@ impl Tenant {
                 info!("Skipping GC in location state {:?}", conf.location);
                 return Ok(GcResult::default());
             }
+
+            if conf.is_gc_blocked_by_lsn_lease_deadline() {
+                info!("Skipping GC because lsn lease deadline is not reached");
+                return Ok(GcResult::default());
+            }
         }
 
         let _guard = match self.gc_block.start().await {
@@ -2630,6 +2663,8 @@ impl Tenant {
             Arc::new(AttachedTenantConf {
                 tenant_conf: new_tenant_conf.clone(),
                 location: inner.location,
+                // Attached location is not changed, no need to update lsn lease deadline.
+                lsn_lease_deadline: inner.lsn_lease_deadline,
             })
         });
 
@@ -4461,13 +4496,17 @@ mod tests {
         tline.freeze_and_flush().await.map_err(|e| e.into())
     }
 
-    #[tokio::test]
+    #[tokio::test(start_paused = true)]
     async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
         let (tenant, ctx) =
             TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
                 .await?
                 .load()
                 .await;
+        // Advance to the lsn lease deadline so that GC is not blocked by
+        // initial transition into AttachedSingle.
+        tokio::time::advance(tenant.get_lsn_lease_length()).await;
+        tokio::time::resume();
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -7244,9 +7283,17 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[tokio::test(start_paused = true)]
     async fn test_lsn_lease() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
+        let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
+            .await
+            .unwrap()
+            .load()
+            .await;
+        // Advance to the lsn lease deadline so that GC is not blocked by
+        // initial transition into AttachedSingle.
+        tokio::time::advance(tenant.get_lsn_lease_length()).await;
+        tokio::time::resume();
         let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
 
         let end_lsn = Lsn(0x100);
@@ -7274,24 +7321,33 @@ mod tests {
 
         let leased_lsns = [0x30, 0x50, 0x70];
         let mut leases = Vec::new();
-        let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| {
-            leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?);
-            Ok(())
+        leased_lsns.iter().for_each(|n| {
+            leases.push(
+                timeline
+                    .init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)
+                    .expect("lease request should succeed"),
+            );
         });
 
-        // Renewing with shorter lease should not change the lease.
-        let updated_lease_0 =
-            timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?;
-        assert_eq!(updated_lease_0.valid_until, leases[0].valid_until);
+        let updated_lease_0 = timeline
+            .renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)
+            .expect("lease renewal should succeed");
+        assert_eq!(
+            updated_lease_0.valid_until, leases[0].valid_until,
+            " Renewing with shorter lease should not change the lease."
+        );
 
-        // Renewing with a long lease should renew lease with later expiration time.
-        let updated_lease_1 = timeline.make_lsn_lease(
-            Lsn(leased_lsns[1]),
-            timeline.get_lsn_lease_length() * 2,
-            &ctx,
-        )?;
-
-        assert!(updated_lease_1.valid_until > leases[1].valid_until);
+        let updated_lease_1 = timeline
+            .renew_lsn_lease(
+                Lsn(leased_lsns[1]),
+                timeline.get_lsn_lease_length() * 2,
+                &ctx,
+            )
+            .expect("lease renewal should succeed");
+        assert!(
+            updated_lease_1.valid_until > leases[1].valid_until,
+            "Renewing with a long lease should renew lease with later expiration time."
+        );
 
         // Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
         info!(
@@ -7308,7 +7364,8 @@ mod tests {
                 &CancellationToken::new(),
                 &ctx,
             )
-            .await?;
+            .await
+            .unwrap();
 
         // Keeping everything <= Lsn(0x80) b/c leases:
         // 0/10: initdb layer
@@ -7322,13 +7379,16 @@ mod tests {
         // Make lease on a already GC-ed LSN.
         // 0/80 does not have a valid lease + is below latest_gc_cutoff
         assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
-        let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx);
-        assert!(res.is_err());
+        timeline
+            .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
+            .expect_err("lease request on GC-ed LSN should fail");
 
         // Should still be able to renew a currently valid lease
         // Assumption: original lease to is still valid for 0/50.
-        let _ =
-            timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?;
+        // (use `Timeline::init_lsn_lease` for testing so it always does validation)
+        timeline
+            .init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)
+            .expect("lease renewal with validation should succeed");
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs
index 1271d25b76..f7a7836a12 100644
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,29 +1,12 @@
-use std::{collections::HashMap, time::Duration};
+use std::collections::HashMap;
 
-use super::remote_timeline_client::index::GcBlockingReason;
-use tokio::time::Instant;
 use utils::id::TimelineId;
 
-type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
+use super::remote_timeline_client::index::GcBlockingReason;
 
-#[derive(Default)]
-struct Storage {
-    timelines_blocked: TimelinesBlocked,
-    /// The deadline before which we are blocked from GC so that
-    /// leases have a chance to be renewed.
-    lsn_lease_deadline: Option<Instant>,
-}
+type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
 
-impl Storage {
-    fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
-        self.lsn_lease_deadline
-            .map(|d| Instant::now() < d)
-            .unwrap_or(false)
-    }
-}
-
-/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
-/// blocking.
+/// GcBlock provides persistent (per-timeline) gc blocking.
 #[derive(Default)]
 pub(crate) struct GcBlock {
     /// The timelines which have current reasons to block gc.
@@ -66,17 +49,6 @@ impl GcBlock {
         }
     }
 
-    /// Sets a deadline before which we cannot proceed to GC due to lsn lease.
-    ///
-    /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
-    /// length, we guarantee that all the leases we granted before will have a chance to renew
-    /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
-    pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
-        let deadline = Instant::now() + lsn_lease_length;
-        let mut g = self.reasons.lock().unwrap();
-        g.lsn_lease_deadline = Some(deadline);
-    }
-
     /// Describe the current gc blocking reasons.
     ///
     /// TODO: make this json serializable.
@@ -102,7 +74,7 @@ impl GcBlock {
     ) -> anyhow::Result<bool> {
         let (added, uploaded) = {
             let mut g = self.reasons.lock().unwrap();
-            let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
+            let set = g.entry(timeline.timeline_id).or_default();
             let added = set.insert(reason);
 
             // LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -133,7 +105,7 @@ impl GcBlock {
 
         let (remaining_blocks, uploaded) = {
             let mut g = self.reasons.lock().unwrap();
-            match g.timelines_blocked.entry(timeline.timeline_id) {
+            match g.entry(timeline.timeline_id) {
                 Entry::Occupied(mut oe) => {
                     let set = oe.get_mut();
                     set.remove(reason);
@@ -147,7 +119,7 @@ impl GcBlock {
                 }
             }
 
-            let remaining_blocks = g.timelines_blocked.len();
+            let remaining_blocks = g.len();
 
             // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
             let uploaded = timeline
@@ -172,11 +144,11 @@ impl GcBlock {
     pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
         let unblocked = {
             let mut g = self.reasons.lock().unwrap();
-            if g.timelines_blocked.is_empty() {
+            if g.is_empty() {
                 return;
             }
 
-            g.timelines_blocked.remove(&timeline.timeline_id);
+            g.remove(&timeline.timeline_id);
 
             BlockingReasons::clean_and_summarize(g).is_none()
         };
@@ -187,11 +159,10 @@ impl GcBlock {
     }
 
     /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
+    pub(crate) fn set_scanned(&self, scanned: Storage) {
         let mut g = self.reasons.lock().unwrap();
-        assert!(g.timelines_blocked.is_empty());
-        g.timelines_blocked
-            .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
+        assert!(g.is_empty());
+        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
 
         if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
             tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -205,7 +176,6 @@ pub(super) struct Guard<'a> {
 
 #[derive(Debug)]
 pub(crate) struct BlockingReasons {
-    tenant_blocked_by_lsn_lease_deadline: bool,
     timelines: usize,
     reasons: enumset::EnumSet<GcBlockingReason>,
 }
@@ -214,8 +184,8 @@ impl std::fmt::Display for BlockingReasons {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
-            "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
-            self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
+            "{} timelines block for {:?}",
+            self.timelines, self.reasons
         )
     }
 }
@@ -223,15 +193,13 @@ impl std::fmt::Display for BlockingReasons {
 impl BlockingReasons {
     fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
         let mut reasons = enumset::EnumSet::empty();
-        g.timelines_blocked.retain(|_key, value| {
+        g.retain(|_key, value| {
             reasons = reasons.union(*value);
             !value.is_empty()
         });
-        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
-        if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
+        if !g.is_empty() {
             Some(BlockingReasons {
-                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
-                timelines: g.timelines_blocked.len(),
+                timelines: g.len(),
                 reasons,
             })
         } else {
@@ -240,17 +208,14 @@ impl BlockingReasons {
     }
 
     fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
-        if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
+        if g.is_empty() {
             None
         } else {
             let reasons = g
-                .timelines_blocked
                 .values()
                 .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
             Some(BlockingReasons {
-                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
-                timelines: g.timelines_blocked.len(),
+                timelines: g.len(),
                 reasons,
             })
         }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index e159ae186d..f6249056d8 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -949,12 +949,6 @@ impl TenantManager {
                 (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
                     match attach_conf.generation.cmp(&tenant.generation) {
                         Ordering::Equal => {
-                            if attach_conf.attach_mode == AttachmentMode::Single {
-                                tenant
-                                    .gc_block
-                                    .set_lsn_lease_deadline(tenant.get_lsn_lease_length());
-                            }
-
                             // A transition from Attached to Attached in the same generation, we may
                             // take our fast path and just provide the updated configuration
                             // to the tenant.
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 3f0f8a21c8..547739e773 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -330,7 +330,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
 
         let mut first = true;
-        tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
         loop {
             tokio::select! {
                 _ = cancel.cancelled() => {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 157c6ab91e..2113a1d726 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -66,6 +66,7 @@ use std::{
 use crate::{
     aux_file::AuxFileSizeEstimator,
     tenant::{
+        config::AttachmentMode,
         layer_map::{LayerMap, SearchResult},
         metadata::TimelineMetadata,
         storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
@@ -1324,16 +1325,38 @@ impl Timeline {
         Ok(())
     }
 
-    /// Obtains a temporary lease blocking garbage collection for the given LSN.
-    ///
-    /// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also
-    /// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if
-    /// the request extends the lease. The returned lease is therefore the maximum between the existing lease and
-    /// the requesting lease.
-    pub(crate) fn make_lsn_lease(
+    /// Initializes an LSN lease. The function will return an error if the requested LSN is less than the `latest_gc_cutoff_lsn`.
+    pub(crate) fn init_lsn_lease(
         &self,
         lsn: Lsn,
         length: Duration,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<LsnLease> {
+        self.make_lsn_lease(lsn, length, true, ctx)
+    }
+
+    /// Renews a lease at a particular LSN. The requested LSN is not validated against the `latest_gc_cutoff_lsn` when we are in the grace period.
+    pub(crate) fn renew_lsn_lease(
+        &self,
+        lsn: Lsn,
+        length: Duration,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<LsnLease> {
+        self.make_lsn_lease(lsn, length, false, ctx)
+    }
+
+    /// Obtains a temporary lease blocking garbage collection for the given LSN.
+    ///
+    /// If we are in `AttachedSingle` mode and is not blocked by the lsn lease deadline, this function will error
+    /// if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is no existing request present.
+    ///
+    /// If there is an existing lease in the map, the lease will be renewed only if the request extends the lease.
+    /// The returned lease is therefore the maximum between the existing lease and the requesting lease.
+    fn make_lsn_lease(
+        &self,
+        lsn: Lsn,
+        length: Duration,
+        init: bool,
         _ctx: &RequestContext,
     ) -> anyhow::Result<LsnLease> {
         let lease = {
@@ -1347,8 +1370,8 @@ impl Timeline {
 
             let entry = gc_info.leases.entry(lsn);
 
-            let lease = {
-                if let Entry::Occupied(mut occupied) = entry {
+            match entry {
+                Entry::Occupied(mut occupied) => {
                     let existing_lease = occupied.get_mut();
                     if valid_until > existing_lease.valid_until {
                         existing_lease.valid_until = valid_until;
@@ -1360,20 +1383,28 @@ impl Timeline {
                     }
 
                     existing_lease.clone()
-                } else {
-                    // Reject already GC-ed LSN (lsn < latest_gc_cutoff)
-                    let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
-                    if lsn < *latest_gc_cutoff_lsn {
-                        bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
+                }
+                Entry::Vacant(vacant) => {
+                    // Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and
+                    // not blocked by the lsn lease deadline.
+                    let validate = {
+                        let conf = self.tenant_conf.load();
+                        conf.location.attach_mode == AttachmentMode::Single
+                            && !conf.is_gc_blocked_by_lsn_lease_deadline()
+                    };
+
+                    if init || validate {
+                        let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
+                        if lsn < *latest_gc_cutoff_lsn {
+                            bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
+                        }
                     }
 
                     let dt: DateTime<Utc> = valid_until.into();
                     info!("lease created, valid until {}", dt);
-                    entry.or_insert(LsnLease { valid_until }).clone()
+                    vacant.insert(LsnLease { valid_until }).clone()
                 }
-            };
-
-            lease
+            }
         };
 
         Ok(lease)
@@ -1950,8 +1981,6 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
     }
 
-    // TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072
-    #[allow(unused)]
     pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 5e8b8d38f7..b08fcc0da1 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -27,7 +27,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     env.pageserver.allowed_errors.extend(
         [
             ".*basebackup .* failed: invalid basebackup lsn.*",
-            ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected",
+            ".*/lsn_lease.*invalid lsn lease request.*",
         ]
     )
 
@@ -108,7 +108,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     assert cur.fetchone() == (1,)
 
     # Create node at pre-initdb lsn
-    with pytest.raises(Exception, match="invalid basebackup lsn"):
+    with pytest.raises(Exception, match="invalid lsn lease request"):
         # compute node startup with invalid LSN should fail
         env.endpoints.create_start(
             branch_name="main",
@@ -167,6 +167,23 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
             )
         return last_flush_lsn
 
+    def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
+        """
+        Trigger GC manually on all pageservers. Then run an `SELECT` query.
+        """
+        for shard, ps in tenant_get_shards(env, env.initial_tenant):
+            client = ps.http_client()
+            gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
+            log.info(f"{gc_result=}")
+
+            assert (
+                gc_result["layers_removed"] == 0
+            ), "No layers should be removed, old layers are guarded by leases."
+
+        with ep_static.cursor() as cur:
+            cur.execute("SELECT count(*) FROM t0")
+            assert cur.fetchone() == (ROW_COUNT,)
+
     # Insert some records on main branch
     with env.endpoints.create_start("main") as ep_main:
         with ep_main.cursor() as cur:
@@ -193,25 +210,31 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
 
             generate_updates_on_main(env, ep_main, i, end=100)
 
-            # Trigger GC
-            for shard, ps in tenant_get_shards(env, env.initial_tenant):
-                client = ps.http_client()
-                gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
-                log.info(f"{gc_result=}")
+            trigger_gc_and_select(env, ep_static)
 
-                assert (
-                    gc_result["layers_removed"] == 0
-                ), "No layers should be removed, old layers are guarded by leases."
+            # Trigger Pageserver restarts
+            for ps in env.pageservers:
+                ps.stop()
+                # Static compute should have at least one lease request failure due to connection.
+                time.sleep(LSN_LEASE_LENGTH / 2)
+                ps.start()
 
-            with ep_static.cursor() as cur:
-                cur.execute("SELECT count(*) FROM t0")
-                assert cur.fetchone() == (ROW_COUNT,)
+            trigger_gc_and_select(env, ep_static)
+
+            # Reconfigure pageservers
+            env.pageservers[0].stop()
+            env.storage_controller.node_configure(
+                env.pageservers[0].id, {"availability": "Offline"}
+            )
+            env.storage_controller.reconcile_until_idle()
+
+            trigger_gc_and_select(env, ep_static)
 
         # Do some update so we can increment latest_gc_cutoff
         generate_updates_on_main(env, ep_main, i, end=100)
 
     # Wait for the existing lease to expire.
-    time.sleep(LSN_LEASE_LENGTH)
+    time.sleep(LSN_LEASE_LENGTH + 1)
     # Now trigger GC again, layers should be removed.
     for shard, ps in tenant_get_shards(env, env.initial_tenant):
         client = ps.http_client()
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
index 765c72cf2a..ddfe9b911f 100644
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -45,10 +45,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
     tenant_after = http.tenant_status(env.initial_tenant)
     assert tenant_before != tenant_after
     gc_blocking = tenant_after["gc_blocking"]
-    assert (
-        gc_blocking
-        == "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }"
-    )
+    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
 
     wait_for_another_gc_round()
     pss.assert_log_contains(gc_skipped_line)

From 43b2445d0bc7fd541f10a441c3935eebb6b48e78 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 27 Sep 2024 16:08:43 +0100
Subject: [PATCH 21/29] proxy: add jwks endpoint to control plane and mock
 providers (#9165)

---
 proxy/src/auth/backend.rs          | 16 ++++++
 proxy/src/console/provider.rs      | 38 ++++++++++++-
 proxy/src/console/provider/mock.rs | 45 ++++++++++++++-
 proxy/src/console/provider/neon.rs | 90 +++++++++++++++++++++++++++---
 proxy/src/http.rs                  | 16 ++++--
 proxy/src/proxy/tests.rs           |  4 ++
 6 files changed, 193 insertions(+), 16 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 4e9f4591ad..5dbfa5cc09 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -80,6 +80,14 @@ pub(crate) trait TestBackend: Send + Sync + 'static {
     fn get_allowed_ips_and_secret(
         &self,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
+    fn dyn_clone(&self) -> Box<dyn TestBackend>;
+}
+
+#[cfg(test)]
+impl Clone for Box<dyn TestBackend> {
+    fn clone(&self) -> Self {
+        TestBackend::dyn_clone(&**self)
+    }
 }
 
 impl std::fmt::Display for Backend<'_, (), ()> {
@@ -585,6 +593,14 @@ mod tests {
             ))
         }
 
+        async fn get_endpoint_jwks(
+            &self,
+            _ctx: &RequestMonitoring,
+            _endpoint: crate::EndpointId,
+        ) -> anyhow::Result<Vec<super::jwt::AuthRule>> {
+            unimplemented!()
+        }
+
         async fn wake_compute(
             &self,
             _ctx: &RequestMonitoring,
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 16e8da605b..95097f2de9 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -5,7 +5,10 @@ pub mod neon;
 use super::messages::{ConsoleError, MetricsAuxInfo};
 use crate::{
     auth::{
-        backend::{ComputeCredentialKeys, ComputeUserInfo},
+        backend::{
+            jwt::{AuthRule, FetchAuthRules},
+            ComputeCredentialKeys, ComputeUserInfo,
+        },
         IpPattern,
     },
     cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru},
@@ -16,7 +19,7 @@ use crate::{
     intern::ProjectIdInt,
     metrics::ApiLockMetrics,
     rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
-    scram, EndpointCacheKey,
+    scram, EndpointCacheKey, EndpointId,
 };
 use dashmap::DashMap;
 use std::{hash::Hash, sync::Arc, time::Duration};
@@ -334,6 +337,12 @@ pub(crate) trait Api {
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
 
+    async fn get_endpoint_jwks(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>>;
+
     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
         &self,
@@ -343,6 +352,7 @@ pub(crate) trait Api {
 }
 
 #[non_exhaustive]
+#[derive(Clone)]
 pub enum ConsoleBackend {
     /// Current Cloud API (V2).
     Console(neon::Api),
@@ -386,6 +396,20 @@ impl Api for ConsoleBackend {
         }
     }
 
+    async fn get_endpoint_jwks(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        match self {
+            Self::Console(api) => api.get_endpoint_jwks(ctx, endpoint).await,
+            #[cfg(any(test, feature = "testing"))]
+            Self::Postgres(api) => api.get_endpoint_jwks(ctx, endpoint).await,
+            #[cfg(test)]
+            Self::Test(_api) => Ok(vec![]),
+        }
+    }
+
     async fn wake_compute(
         &self,
         ctx: &RequestMonitoring,
@@ -552,3 +576,13 @@ impl WakeComputePermit {
         res
     }
 }
+
+impl FetchAuthRules for ConsoleBackend {
+    async fn fetch_auth_rules(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        self.get_endpoint_jwks(ctx, endpoint).await
+    }
+}
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 1b77418de6..b548a0203a 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -4,7 +4,9 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
 };
-use crate::context::RequestMonitoring;
+use crate::{
+    auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName,
+};
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use crate::{auth::IpPattern, cache::Cached};
 use crate::{
@@ -118,6 +120,39 @@ impl Api {
         })
     }
 
+    async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result<Vec<AuthRule>> {
+        let (client, connection) =
+            tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
+
+        let connection = tokio::spawn(connection);
+
+        let res = client.query(
+                "select id, jwks_url, audience, role_names from neon_control_plane.endpoint_jwks where endpoint_id = $1",
+                &[&endpoint.as_str()],
+            )
+            .await?;
+
+        let mut rows = vec![];
+        for row in res {
+            rows.push(AuthRule {
+                id: row.get("id"),
+                jwks_url: url::Url::parse(row.get("jwks_url"))?,
+                audience: row.get("audience"),
+                role_names: row
+                    .get::<_, Vec<String>>("role_names")
+                    .into_iter()
+                    .map(RoleName::from)
+                    .map(|s| RoleNameInt::from(&s))
+                    .collect(),
+            });
+        }
+
+        drop(client);
+        connection.await??;
+
+        Ok(rows)
+    }
+
     async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
         let mut config = compute::ConnCfg::new();
         config
@@ -185,6 +220,14 @@ impl super::Api for Api {
         ))
     }
 
+    async fn get_endpoint_jwks(
+        &self,
+        _ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        self.do_get_endpoint_jwks(endpoint).await
+    }
+
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index b004bf4ecf..2d527f378c 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -7,27 +7,33 @@ use super::{
     NodeInfo,
 };
 use crate::{
-    auth::backend::ComputeUserInfo,
+    auth::backend::{jwt::AuthRule, ComputeUserInfo},
     compute,
-    console::messages::{ColdStartInfo, Reason},
+    console::messages::{ColdStartInfo, EndpointJwksResponse, Reason},
     http,
     metrics::{CacheOutcome, Metrics},
     rate_limiter::WakeComputeRateLimiter,
-    scram, EndpointCacheKey,
+    scram, EndpointCacheKey, EndpointId,
 };
 use crate::{cache::Cached, context::RequestMonitoring};
+use ::http::{header::AUTHORIZATION, HeaderName};
+use anyhow::bail;
 use futures::TryFutureExt;
 use std::{sync::Arc, time::Duration};
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{debug, error, info, info_span, warn, Instrument};
 
+const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
+
+#[derive(Clone)]
 pub struct Api {
     endpoint: http::Endpoint,
     pub caches: &'static ApiCaches,
     pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
     pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
-    jwt: String,
+    // put in a shared ref so we don't copy secrets all over in memory
+    jwt: Arc<str>,
 }
 
 impl Api {
@@ -38,7 +44,9 @@ impl Api {
         locks: &'static ApiLocks<EndpointCacheKey>,
         wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
     ) -> Self {
-        let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default();
+        let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN")
+            .unwrap_or_default()
+            .into();
         Self {
             endpoint,
             caches,
@@ -71,9 +79,9 @@ impl Api {
         async {
             let request = self
                 .endpoint
-                .get("proxy_get_role_secret")
-                .header("X-Request-ID", &request_id)
-                .header("Authorization", format!("Bearer {}", &self.jwt))
+                .get_path("proxy_get_role_secret")
+                .header(X_REQUEST_ID, &request_id)
+                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
                 .query(&[("session_id", ctx.session_id())])
                 .query(&[
                     ("application_name", application_name.as_str()),
@@ -125,6 +133,61 @@ impl Api {
         .await
     }
 
+    async fn do_get_endpoint_jwks(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        if !self
+            .caches
+            .endpoints_cache
+            .is_valid(ctx, &endpoint.normalize())
+            .await
+        {
+            bail!("endpoint not found");
+        }
+        let request_id = ctx.session_id().to_string();
+        async {
+            let request = self
+                .endpoint
+                .get_with_url(|url| {
+                    url.path_segments_mut()
+                        .push("endpoints")
+                        .push(endpoint.as_str())
+                        .push("jwks");
+                })
+                .header(X_REQUEST_ID, &request_id)
+                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
+                .query(&[("session_id", ctx.session_id())])
+                .build()?;
+
+            info!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
+            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
+            let response = self.endpoint.execute(request).await?;
+            drop(pause);
+            info!(duration = ?start.elapsed(), "received http response");
+
+            let body = parse_body::<EndpointJwksResponse>(response).await?;
+
+            let rules = body
+                .jwks
+                .into_iter()
+                .map(|jwks| AuthRule {
+                    id: jwks.id,
+                    jwks_url: jwks.jwks_url,
+                    audience: jwks.jwt_audience,
+                    role_names: jwks.role_names,
+                })
+                .collect();
+
+            Ok(rules)
+        }
+        .map_err(crate::error::log_error)
+        .instrument(info_span!("http", id = request_id))
+        .await
+    }
+
     async fn do_wake_compute(
         &self,
         ctx: &RequestMonitoring,
@@ -135,7 +198,7 @@ impl Api {
         async {
             let mut request_builder = self
                 .endpoint
-                .get("proxy_wake_compute")
+                .get_path("proxy_wake_compute")
                 .header("X-Request-ID", &request_id)
                 .header("Authorization", format!("Bearer {}", &self.jwt))
                 .query(&[("session_id", ctx.session_id())])
@@ -262,6 +325,15 @@ impl super::Api for Api {
         ))
     }
 
+    #[tracing::instrument(skip_all)]
+    async fn get_endpoint_jwks(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        self.do_get_endpoint_jwks(ctx, endpoint).await
+    }
+
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index c77d95f47d..14720b5c6b 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -86,9 +86,17 @@ impl Endpoint {
 
     /// Return a [builder](RequestBuilder) for a `GET` request,
     /// appending a single `path` segment to the base endpoint URL.
-    pub(crate) fn get(&self, path: &str) -> RequestBuilder {
+    pub(crate) fn get_path(&self, path: &str) -> RequestBuilder {
+        self.get_with_url(|u| {
+            u.path_segments_mut().push(path);
+        })
+    }
+
+    /// Return a [builder](RequestBuilder) for a `GET` request,
+    /// accepting a closure to modify the url path segments for more complex paths queries.
+    pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder {
         let mut url = self.endpoint.clone();
-        url.path_segments_mut().push(path);
+        f(&mut url);
         self.client.get(url.into_inner())
     }
 
@@ -144,7 +152,7 @@ mod tests {
 
         // Validate that this pattern makes sense.
         let req = endpoint
-            .get("frobnicate")
+            .get_path("frobnicate")
             .query(&[
                 ("foo", Some("10")), // should be just `foo=10`
                 ("bar", None),       // shouldn't be passed at all
@@ -162,7 +170,7 @@ mod tests {
         let endpoint = Endpoint::new(url, Client::new());
 
         let req = endpoint
-            .get("frobnicate")
+            .get_path("frobnicate")
             .query(&[("session_id", uuid::Uuid::nil())])
             .build()?;
 
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 752d982726..058ec06e02 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -525,6 +525,10 @@ impl TestBackend for TestConnectMechanism {
     {
         unimplemented!("not used in tests")
     }
+
+    fn dyn_clone(&self) -> Box<dyn TestBackend> {
+        Box::new(self.clone())
+    }
 }
 
 fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {

From d56c4e7a380e3932fcfa9003e645bf243ed54bf6 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:41:42 -0400
Subject: [PATCH 22/29]  pageserver: remove AdjacentVectoredReadBuilder and
 bump minmimum io_buffer_alignment to 512 (#9175)

Part of #8130

## Problem

After deploying https://github.com/neondatabase/infra/pull/1927, we
shipped `io_buffer_alignment=512` to all prod region. The
`AdjacentVectoredReadBuilder` code path is no longer taken and we are
running pageserver unit tests 6 times in the CI. Removing it would
reduce the test duration by 30-60s.

## Summary of changes

- Remove `AdjacentVectoredReadBuilder` code.
- Bump the minimum `io_buffer_alignment` requirement to at least 512
bytes.
- Use default `io_buffer_alignment` for Rust unit tests.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 .github/workflows/_build-and-test-locally.yml |   4 +-
 .../src/tenant/storage_layer/delta_layer.rs   |  12 +-
 pageserver/src/tenant/vectored_blob_io.rs     | 216 +++---------------
 pageserver/src/virtual_file.rs                |  32 ++-
 4 files changed, 57 insertions(+), 207 deletions(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 5ea911eb95..5fc6aa247a 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -236,9 +236,7 @@ jobs:
 
           # run pageserver tests with different settings
           for io_engine in std-fs tokio-epoll-uring ; do
-            for io_buffer_alignment in 0 1 512 ; do
-              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
-            done
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
           done
 
           # Run separate tests for real S3
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 2b212cfed5..9241ff569c 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -40,7 +40,7 @@ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
-    VectoredReadCoalesceMode, VectoredReadPlanner,
+    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
@@ -1133,7 +1133,7 @@ impl DeltaLayerInner {
         ctx: &RequestContext,
     ) -> anyhow::Result<usize> {
         use crate::tenant::vectored_blob_io::{
-            BlobMeta, VectoredReadBuilder, VectoredReadExtended,
+            BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended,
         };
         use futures::stream::TryStreamExt;
 
@@ -1183,8 +1183,8 @@ impl DeltaLayerInner {
 
         let mut prev: Option<(Key, Lsn, BlobRef)> = None;
 
-        let mut read_builder: Option<VectoredReadBuilder> = None;
-        let read_mode = VectoredReadCoalesceMode::get();
+        let mut read_builder: Option<ChunkedVectoredReadBuilder> = None;
+        let align = virtual_file::get_io_buffer_alignment();
 
         let max_read_size = self
             .max_vectored_read_bytes
@@ -1228,12 +1228,12 @@ impl DeltaLayerInner {
                 {
                     None
                 } else {
-                    read_builder.replace(VectoredReadBuilder::new(
+                    read_builder.replace(ChunkedVectoredReadBuilder::new(
                         offsets.start.pos(),
                         offsets.end.pos(),
                         meta,
                         max_read_size,
-                        read_mode,
+                        align,
                     ))
                 }
             } else {
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index aa37a45898..1faa6bab99 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -185,171 +185,7 @@ pub(crate) enum VectoredReadExtended {
     No,
 }
 
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum VectoredReadCoalesceMode {
-    /// Only coalesce exactly adjacent reads.
-    AdjacentOnly,
-    /// In addition to adjacent reads, also consider reads whose corresponding
-    /// `end` and `start` offsets reside at the same chunk.
-    Chunked(usize),
-}
-
-impl VectoredReadCoalesceMode {
-    /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
-    /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
-    pub(crate) fn get() -> Self {
-        let align = virtual_file::get_io_buffer_alignment_raw();
-        if align == 0 {
-            VectoredReadCoalesceMode::AdjacentOnly
-        } else {
-            VectoredReadCoalesceMode::Chunked(align)
-        }
-    }
-}
-
-pub(crate) enum VectoredReadBuilder {
-    Adjacent(AdjacentVectoredReadBuilder),
-    Chunked(ChunkedVectoredReadBuilder),
-}
-
-impl VectoredReadBuilder {
-    fn new_impl(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: Option<usize>,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        match mode {
-            VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
-                AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
-            ),
-            VectoredReadCoalesceMode::Chunked(chunk_size) => {
-                Self::Chunked(ChunkedVectoredReadBuilder::new(
-                    start_offset,
-                    end_offset,
-                    meta,
-                    max_read_size,
-                    chunk_size,
-                ))
-            }
-        }
-    }
-
-    pub(crate) fn new(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: usize,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
-    }
-
-    pub(crate) fn new_streaming(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        Self::new_impl(start_offset, end_offset, meta, None, mode)
-    }
-
-    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
-            VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
-        }
-    }
-
-    pub(crate) fn build(self) -> VectoredRead {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.build(),
-            VectoredReadBuilder::Chunked(builder) => builder.build(),
-        }
-    }
-
-    pub(crate) fn size(&self) -> usize {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.size(),
-            VectoredReadBuilder::Chunked(builder) => builder.size(),
-        }
-    }
-}
-
-pub(crate) struct AdjacentVectoredReadBuilder {
-    /// Start offset of the read.
-    start: u64,
-    // End offset of the read.
-    end: u64,
-    /// Start offset and metadata for each blob in this read
-    blobs_at: VecMap<u64, BlobMeta>,
-    max_read_size: Option<usize>,
-}
-
-impl AdjacentVectoredReadBuilder {
-    /// Start building a new vectored read.
-    ///
-    /// Note that by design, this does not check against reading more than `max_read_size` to
-    /// support reading larger blobs than the configuration value. The builder will be single use
-    /// however after that.
-    pub(crate) fn new(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: Option<usize>,
-    ) -> Self {
-        let mut blobs_at = VecMap::default();
-        blobs_at
-            .append(start_offset, meta)
-            .expect("First insertion always succeeds");
-
-        Self {
-            start: start_offset,
-            end: end_offset,
-            blobs_at,
-            max_read_size,
-        }
-    }
-    /// Attempt to extend the current read with a new blob if the start
-    /// offset matches with the current end of the vectored read
-    /// and the resuting size is below the max read size
-    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
-        tracing::trace!(start, end, "trying to extend");
-        let size = (end - start) as usize;
-        let not_limited_by_max_read_size = {
-            if let Some(max_read_size) = self.max_read_size {
-                self.size() + size <= max_read_size
-            } else {
-                true
-            }
-        };
-
-        if self.end == start && not_limited_by_max_read_size {
-            self.end = end;
-            self.blobs_at
-                .append(start, meta)
-                .expect("LSNs are ordered within vectored reads");
-
-            return VectoredReadExtended::Yes;
-        }
-
-        VectoredReadExtended::No
-    }
-
-    pub(crate) fn size(&self) -> usize {
-        (self.end - self.start) as usize
-    }
-
-    pub(crate) fn build(self) -> VectoredRead {
-        VectoredRead {
-            start: self.start,
-            end: self.end,
-            blobs_at: self.blobs_at,
-        }
-    }
-}
-
+/// A vectored read builder that tries to coalesce all reads that fits in a chunk.
 pub(crate) struct ChunkedVectoredReadBuilder {
     /// Start block number
     start_blk_no: usize,
@@ -373,7 +209,7 @@ impl ChunkedVectoredReadBuilder {
     /// Note that by design, this does not check against reading more than `max_read_size` to
     /// support reading larger blobs than the configuration value. The builder will be single use
     /// however after that.
-    pub(crate) fn new(
+    fn new_impl(
         start_offset: u64,
         end_offset: u64,
         meta: BlobMeta,
@@ -396,6 +232,25 @@ impl ChunkedVectoredReadBuilder {
         }
     }
 
+    pub(crate) fn new(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        max_read_size: usize,
+        align: usize,
+    ) -> Self {
+        Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align)
+    }
+
+    pub(crate) fn new_streaming(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        align: usize,
+    ) -> Self {
+        Self::new_impl(start_offset, end_offset, meta, None, align)
+    }
+
     /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
     ///
     /// The resulting size also must be below the max read size.
@@ -474,17 +329,17 @@ pub struct VectoredReadPlanner {
 
     max_read_size: usize,
 
-    mode: VectoredReadCoalesceMode,
+    align: usize,
 }
 
 impl VectoredReadPlanner {
     pub fn new(max_read_size: usize) -> Self {
-        let mode = VectoredReadCoalesceMode::get();
+        let align = virtual_file::get_io_buffer_alignment();
         Self {
             blobs: BTreeMap::new(),
             prev: None,
             max_read_size,
-            mode,
+            align,
         }
     }
 
@@ -545,7 +400,7 @@ impl VectoredReadPlanner {
     }
 
     pub fn finish(self) -> Vec<VectoredRead> {
-        let mut current_read_builder: Option<VectoredReadBuilder> = None;
+        let mut current_read_builder: Option<ChunkedVectoredReadBuilder> = None;
         let mut reads = Vec::new();
 
         for (key, blobs_for_key) in self.blobs {
@@ -558,12 +413,12 @@ impl VectoredReadPlanner {
                 };
 
                 if extended == VectoredReadExtended::No {
-                    let next_read_builder = VectoredReadBuilder::new(
+                    let next_read_builder = ChunkedVectoredReadBuilder::new(
                         start_offset,
                         end_offset,
                         BlobMeta { key, lsn },
                         self.max_read_size,
-                        self.mode,
+                        self.align,
                     );
 
                     let prev_read_builder = current_read_builder.replace(next_read_builder);
@@ -688,7 +543,7 @@ impl<'a> VectoredBlobReader<'a> {
 /// `handle` gets called and when the current key would just exceed the read_size and
 /// max_cnt constraints.
 pub struct StreamingVectoredReadPlanner {
-    read_builder: Option<VectoredReadBuilder>,
+    read_builder: Option<ChunkedVectoredReadBuilder>,
     // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
     prev: Option<(Key, Lsn, u64)>,
     /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
@@ -699,21 +554,21 @@ pub struct StreamingVectoredReadPlanner {
     /// Size of the current batch
     cnt: usize,
 
-    mode: VectoredReadCoalesceMode,
+    align: usize,
 }
 
 impl StreamingVectoredReadPlanner {
     pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
         assert!(max_cnt > 0);
         assert!(max_read_size > 0);
-        let mode = VectoredReadCoalesceMode::get();
+        let align = virtual_file::get_io_buffer_alignment();
         Self {
             read_builder: None,
             prev: None,
             max_cnt,
             max_read_size,
             cnt: 0,
-            mode,
+            align,
         }
     }
 
@@ -762,11 +617,11 @@ impl StreamingVectoredReadPlanner {
             }
             None => {
                 self.read_builder = {
-                    Some(VectoredReadBuilder::new_streaming(
+                    Some(ChunkedVectoredReadBuilder::new_streaming(
                         start_offset,
                         end_offset,
                         BlobMeta { key, lsn },
-                        self.mode,
+                        self.align,
                     ))
                 };
             }
@@ -1092,7 +947,7 @@ mod tests {
         let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
         let mut buf = BytesMut::with_capacity(reserved_bytes);
 
-        let mode = VectoredReadCoalesceMode::get();
+        let align = virtual_file::get_io_buffer_alignment();
         let vectored_blob_reader = VectoredBlobReader::new(&file);
         let meta = BlobMeta {
             key: Key::MIN,
@@ -1104,7 +959,8 @@ mod tests {
             if idx + 1 == offsets.len() {
                 continue;
             }
-            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
+            let read_builder =
+                ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align);
             let read = read_builder.build();
             let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
             assert_eq!(result.blobs.len(), 1);
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 57856eea80..17d9f94c95 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1147,7 +1147,9 @@ pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize)
         panic!("virtual_file::init called twice");
     }
     if set_io_buffer_alignment(io_buffer_alignment).is_err() {
-        panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
+        panic!(
+            "IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}"
+        );
     }
     io_engine::init(engine);
     crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
@@ -1174,14 +1176,16 @@ fn get_open_files() -> &'static OpenFiles {
 
 static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
 
-/// Returns true if `x` is zero or a power of two.
-fn is_zero_or_power_of_two(x: usize) -> bool {
-    (x == 0) || ((x & (x - 1)) == 0)
+/// Returns true if the alignment is a power of two and is greater or equal to 512.
+fn is_valid_io_buffer_alignment(align: usize) -> bool {
+    align.is_power_of_two() && align >= 512
 }
 
+/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is
+/// not a power of two or less than 512 bytes.
 #[allow(unused)]
 pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
-    if is_zero_or_power_of_two(align) {
+    if is_valid_io_buffer_alignment(align) {
         IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
         Ok(())
     } else {
@@ -1189,19 +1193,19 @@ pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
     }
 }
 
-/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
+/// Gets the io buffer alignment.
 ///
-/// This function should be used to check the raw config value.
-pub(crate) fn get_io_buffer_alignment_raw() -> usize {
+/// This function should be used for getting the actual alignment value to use.
+pub(crate) fn get_io_buffer_alignment() -> usize {
     let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
 
     if cfg!(test) {
         let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
         if let Some(test_align) = utils::env::var(env_var_name) {
-            if is_zero_or_power_of_two(test_align) {
+            if is_valid_io_buffer_alignment(test_align) {
                 test_align
             } else {
-                panic!("IO buffer alignment ({test_align}) is not a power of two");
+                panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}");
             }
         } else {
             align
@@ -1211,14 +1215,6 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize {
     }
 }
 
-/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
-///
-/// This function should be used for getting the actual alignment value to use.
-pub(crate) fn get_io_buffer_alignment() -> usize {
-    let align = get_io_buffer_alignment_raw();
-    align.max(1)
-}
-
 #[cfg(test)]
 mod tests {
     use crate::context::DownloadBehavior;

From 5c5871111a754ea92eee1a07248f05582e608cbe Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Fri, 27 Sep 2024 17:47:05 +0200
Subject: [PATCH 23/29] WalProposer: Read WAL directly from WAL buffers in PG17
 (#9171)

This reduces the overhead of the WalProposer when it is not being
throttled by SK WAL acceptance rate
---
 pgxn/neon/walproposer_pg.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 4d0d06e6de..bb65a11c7d 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1473,11 +1473,33 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
 {
 	NeonWALReadResult res;
 
-	res = NeonWALRead(sk->xlogreader,
-					  buf,
-					  startptr,
-					  count,
-					  walprop_pg_get_timeline_id());
+#if PG_MAJORVERSION_NUM >= 17
+	if (!sk->wp->config->syncSafekeepers)
+	{
+		Size	rbytes;
+		rbytes = WALReadFromBuffers(buf, startptr, count,
+									walprop_pg_get_timeline_id());
+
+		startptr += rbytes;
+		count -= rbytes;
+	}
+#endif
+
+	if (count == 0)
+	{
+		res = NEON_WALREAD_SUCCESS;
+	}
+	else
+	{
+		Assert(count > 0);
+
+		/* Now read the remaining WAL from the WAL file */
+		res = NeonWALRead(sk->xlogreader,
+						  buf,
+						  startptr,
+						  count,
+						  walprop_pg_get_timeline_id());
+	}
 
 	if (res == NEON_WALREAD_SUCCESS)
 	{

From cf6a776fcf27220015dbea2fe784cf1922d0549a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 27 Sep 2024 19:25:35 +0300
Subject: [PATCH 24/29] tests: Reduce the # of iterations in
 safekeeper::test_random_schedules (#9182)

To make it faster. On my laptop, it takes about 30 before this commit.
In the arm64 debug variant in CI, it takes about 120 s. Reduce it by
factor of 4.
---
 safekeeper/tests/random_test.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs
index 7bdee35cd7..1a932ef699 100644
--- a/safekeeper/tests/random_test.rs
+++ b/safekeeper/tests/random_test.rs
@@ -9,7 +9,7 @@ use crate::walproposer_sim::{
 
 pub mod walproposer_sim;
 
-// Generates 2000 random seeds and runs a schedule for each of them.
+// Generates 500 random seeds and runs a schedule for each of them.
 // If you see this test fail, please report the last seed to the
 // @safekeeper team.
 #[test]
@@ -17,7 +17,7 @@ fn test_random_schedules() -> anyhow::Result<()> {
     let clock = init_logger();
     let mut config = TestConfig::new(Some(clock));
 
-    for _ in 0..2000 {
+    for _ in 0..500 {
         let seed: u64 = rand::thread_rng().gen();
         config.network = generate_network_opts(seed);
 

From cde1654d7ba38881b2e34c2b4c2e15392726a190 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:58:50 -0400
Subject: [PATCH 25/29] fix(pageserver): abort process if fsync fails (#9108)

close https://github.com/neondatabase/neon/issues/8140

The original issue is rather vague on what we should do. After
discussion w/ @problame we decided to narrow down the problems we want
to solve in that issue.

* read path -- do not panic for now.
* write path -- panic only on write errors (i.e., device error, fsync
error), but not on no-space for now.

The guideline is that if the pageserver behavior could lead to violation
of persistent constraints (i.e., return an operation as successful but
not actually persisting things), we should panic. Fsync is the place
where both of us agree that we should panic, because if fsync fails, the
kernel will mark dirty pages as clean, and the next fsync will not
necessarily return false. This would make the storage client assume the
operation is successful.

## Summary of changes

Make fsync panic on fatal errors.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/mgr.rs                             | 6 +++++-
 pageserver/src/tenant/remote_timeline_client/download.rs | 3 +++
 pageserver/src/tenant/storage_layer/delta_layer.rs       | 6 ++++--
 pageserver/src/tenant/storage_layer/image_layer.rs       | 6 ++++--
 pageserver/src/virtual_file.rs                           | 5 +++--
 5 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index f6249056d8..c7212e89ba 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -219,7 +219,11 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
         + TEMP_FILE_SUFFIX;
     let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
     fs::rename(path.as_ref(), &tmp_path).await?;
-    fs::File::open(parent).await?.sync_all().await?;
+    fs::File::open(parent)
+        .await?
+        .sync_all()
+        .await
+        .maybe_fatal_err("safe_rename_tenant_dir")?;
     Ok(tmp_path)
 }
 
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 9fbe2f0da5..97506b7e9a 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -178,6 +178,7 @@ async fn download_object<'a>(
                 destination_file
                     .flush()
                     .await
+                    .maybe_fatal_err("download_object sync_all")
                     .with_context(|| format!("flush source file at {dst_path}"))
                     .map_err(DownloadError::Other)?;
 
@@ -185,6 +186,7 @@ async fn download_object<'a>(
                 destination_file
                     .sync_all()
                     .await
+                    .maybe_fatal_err("download_object sync_all")
                     .with_context(|| format!("failed to fsync source file at {dst_path}"))
                     .map_err(DownloadError::Other)?;
 
@@ -232,6 +234,7 @@ async fn download_object<'a>(
                 destination_file
                     .sync_all()
                     .await
+                    .maybe_fatal_err("download_object sync_all")
                     .with_context(|| format!("failed to fsync source file at {dst_path}"))
                     .map_err(DownloadError::Other)?;
 
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 9241ff569c..6f9eda85f5 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -44,7 +44,7 @@ use crate::tenant::vectored_blob_io::{
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
-use crate::virtual_file::{self, VirtualFile};
+use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
@@ -589,7 +589,9 @@ impl DeltaLayerWriterInner {
         );
 
         // fsync the file
-        file.sync_all().await?;
+        file.sync_all()
+            .await
+            .maybe_fatal_err("delta_layer sync_all")?;
 
         trace!("created delta layer {}", self.path);
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 940d169db0..3dcd7bc962 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -41,7 +41,7 @@ use crate::tenant::vectored_blob_io::{
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::virtual_file::{self, VirtualFile};
+use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
@@ -889,7 +889,9 @@ impl ImageLayerWriterInner {
         // set inner.file here. The first read will have to re-open it.
 
         // fsync the file
-        file.sync_all().await?;
+        file.sync_all()
+            .await
+            .maybe_fatal_err("image_layer sync_all")?;
 
         trace!("created image layer {}", self.path);
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 17d9f94c95..5b7b279888 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -466,6 +466,7 @@ impl VirtualFile {
                 &[]
             };
             utils::crashsafe::overwrite(&final_path, &tmp_path, content)
+                .maybe_fatal_err("crashsafe_overwrite")
         })
         .await
         .expect("blocking task is never aborted")
@@ -475,7 +476,7 @@ impl VirtualFile {
     pub async fn sync_all(&self) -> Result<(), Error> {
         with_file!(self, StorageIoOperation::Fsync, |file_guard| {
             let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
-            res
+            res.maybe_fatal_err("sync_all")
         })
     }
 
@@ -483,7 +484,7 @@ impl VirtualFile {
     pub async fn sync_data(&self) -> Result<(), Error> {
         with_file!(self, StorageIoOperation::Fsync, |file_guard| {
             let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
-            res
+            res.maybe_fatal_err("sync_data")
         })
     }
 

From e989a5e4a204f05bf9ebab01182af0acf2b85328 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 27 Sep 2024 22:08:46 +0300
Subject: [PATCH 26/29] neon_local: Use clap derive macros to parse the CLI
 args (#9103)

This is easier to work with.
---
 control_plane/src/bin/neon_local.rs  | 1716 +++++++++++++-------------
 control_plane/src/branch_mappings.rs |   94 ++
 control_plane/src/safekeeper.rs      |    4 +-
 3 files changed, 949 insertions(+), 865 deletions(-)
 create mode 100644 control_plane/src/branch_mappings.rs

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index b6532ff2ac..624936620d 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -6,7 +6,7 @@
 //! rely on `neon_local` to set up the environment for each test.
 //!
 use anyhow::{anyhow, bail, Context, Result};
-use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
+use clap::Parser;
 use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{
@@ -56,10 +56,627 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);
 
-const DEFAULT_PG_VERSION: &str = "16";
+const DEFAULT_PG_VERSION: u32 = 16;
 
 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
 
+#[derive(clap::Parser)]
+#[command(version = GIT_VERSION, about, name = "Neon CLI")]
+struct Cli {
+    #[command(subcommand)]
+    command: NeonLocalCmd,
+}
+
+#[derive(clap::Subcommand)]
+enum NeonLocalCmd {
+    Init(InitCmdArgs),
+
+    #[command(subcommand)]
+    Tenant(TenantCmd),
+    #[command(subcommand)]
+    Timeline(TimelineCmd),
+    #[command(subcommand)]
+    Pageserver(PageserverCmd),
+    #[command(subcommand)]
+    #[clap(alias = "storage_controller")]
+    StorageController(StorageControllerCmd),
+    #[command(subcommand)]
+    #[clap(alias = "storage_broker")]
+    StorageBroker(StorageBrokerCmd),
+    #[command(subcommand)]
+    Safekeeper(SafekeeperCmd),
+    #[command(subcommand)]
+    Endpoint(EndpointCmd),
+    #[command(subcommand)]
+    Mappings(MappingsCmd),
+
+    Start(StartCmdArgs),
+    Stop(StopCmdArgs),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")]
+struct InitCmdArgs {
+    #[clap(long, help("How many pageservers to create (default 1)"))]
+    num_pageservers: Option<u16>,
+
+    #[clap(long)]
+    config: Option<PathBuf>,
+
+    #[clap(long, help("Force initialization even if the repository is not empty"))]
+    #[arg(value_parser)]
+    #[clap(default_value = "must-not-exist")]
+    force: InitForceMode,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start pageserver and safekeepers")]
+struct StartCmdArgs {
+    #[clap(long = "start-timeout", default_value = "10s")]
+    timeout: humantime::Duration,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Stop pageserver and safekeepers")]
+struct StopCmdArgs {
+    #[arg(value_enum)]
+    #[clap(long, default_value_t = StopMode::Fast)]
+    mode: StopMode,
+}
+
+#[derive(Clone, Copy, clap::ValueEnum)]
+enum StopMode {
+    Fast,
+    Immediate,
+}
+
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage tenants")]
+enum TenantCmd {
+    List,
+    Create(TenantCreateCmdArgs),
+    SetDefault(TenantSetDefaultCmdArgs),
+    Config(TenantConfigCmdArgs),
+    Import(TenantImportCmdArgs),
+}
+
+#[derive(clap::Args)]
+struct TenantCreateCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: Option<TenantId>,
+
+    #[clap(
+        long,
+        help = "Use a specific timeline id when creating a tenant and its initial timeline"
+    )]
+    timeline_id: Option<TimelineId>,
+
+    #[clap(short = 'c')]
+    config: Vec<String>,
+
+    #[arg(default_value_t = DEFAULT_PG_VERSION)]
+    #[clap(long, help = "Postgres version to use for the initial timeline")]
+    pg_version: u32,
+
+    #[clap(
+        long,
+        help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified"
+    )]
+    set_default: bool,
+
+    #[clap(long, help = "Number of shards in the new tenant")]
+    #[arg(default_value_t = 0)]
+    shard_count: u8,
+    #[clap(long, help = "Sharding stripe size in pages")]
+    shard_stripe_size: Option<u32>,
+
+    #[clap(long, help = "Placement policy shards in this tenant")]
+    #[arg(value_parser = parse_placement_policy)]
+    placement_policy: Option<PlacementPolicy>,
+}
+
+fn parse_placement_policy(s: &str) -> anyhow::Result<PlacementPolicy> {
+    Ok(serde_json::from_str::<PlacementPolicy>(s)?)
+}
+
+#[derive(clap::Args)]
+#[clap(
+    about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"
+)]
+struct TenantSetDefaultCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: TenantId,
+}
+
+#[derive(clap::Args)]
+struct TenantConfigCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: Option<TenantId>,
+
+    #[clap(short = 'c')]
+    config: Vec<String>,
+}
+
+#[derive(clap::Args)]
+#[clap(
+    about = "Import a tenant that is present in remote storage, and create branches for its timelines"
+)]
+struct TenantImportCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: TenantId,
+}
+
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage timelines")]
+enum TimelineCmd {
+    List(TimelineListCmdArgs),
+    Branch(TimelineBranchCmdArgs),
+    Create(TimelineCreateCmdArgs),
+    Import(TimelineImportCmdArgs),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "List all timelines available to this pageserver")]
+struct TimelineListCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_shard_id: Option<TenantShardId>,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Create a new timeline, branching off from another timeline")]
+struct TimelineBranchCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: Option<TenantId>,
+
+    #[clap(long, help = "New timeline's ID")]
+    timeline_id: Option<TimelineId>,
+
+    #[clap(long, help = "Human-readable alias for the new timeline")]
+    branch_name: String,
+
+    #[clap(
+        long,
+        help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name."
+    )]
+    ancestor_branch_name: Option<String>,
+
+    #[clap(
+        long,
+        help = "When using another timeline as base, use a specific Lsn in it instead of the latest one"
+    )]
+    ancestor_start_lsn: Option<Lsn>,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Create a new blank timeline")]
+struct TimelineCreateCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: Option<TenantId>,
+
+    #[clap(long, help = "New timeline's ID")]
+    timeline_id: Option<TimelineId>,
+
+    #[clap(long, help = "Human-readable alias for the new timeline")]
+    branch_name: String,
+
+    #[arg(default_value_t = DEFAULT_PG_VERSION)]
+    #[clap(long, help = "Postgres version")]
+    pg_version: u32,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Import timeline from a basebackup directory")]
+struct TimelineImportCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: Option<TenantId>,
+
+    #[clap(long, help = "New timeline's ID")]
+    timeline_id: TimelineId,
+
+    #[clap(long, help = "Human-readable alias for the new timeline")]
+    branch_name: String,
+
+    #[clap(long, help = "Basebackup tarfile to import")]
+    base_tarfile: PathBuf,
+
+    #[clap(long, help = "Lsn the basebackup starts at")]
+    base_lsn: Lsn,
+
+    #[clap(long, help = "Wal to add after base")]
+    wal_tarfile: Option<PathBuf>,
+
+    #[clap(long, help = "Lsn the basebackup ends at")]
+    end_lsn: Option<Lsn>,
+
+    #[arg(default_value_t = DEFAULT_PG_VERSION)]
+    #[clap(long, help = "Postgres version of the backup being imported")]
+    pg_version: u32,
+}
+
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage pageservers")]
+enum PageserverCmd {
+    Status(PageserverStatusCmdArgs),
+    Start(PageserverStartCmdArgs),
+    Stop(PageserverStopCmdArgs),
+    Restart(PageserverRestartCmdArgs),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Show status of a local pageserver")]
+struct PageserverStatusCmdArgs {
+    #[clap(long = "id", help = "pageserver id")]
+    pageserver_id: Option<NodeId>,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start local pageserver")]
+struct PageserverStartCmdArgs {
+    #[clap(long = "id", help = "pageserver id")]
+    pageserver_id: Option<NodeId>,
+
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Stop local pageserver")]
+struct PageserverStopCmdArgs {
+    #[clap(long = "id", help = "pageserver id")]
+    pageserver_id: Option<NodeId>,
+
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
+    #[arg(value_enum, default_value = "fast")]
+    stop_mode: StopMode,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Restart local pageserver")]
+struct PageserverRestartCmdArgs {
+    #[clap(long = "id", help = "pageserver id")]
+    pageserver_id: Option<NodeId>,
+
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage storage controller")]
+enum StorageControllerCmd {
+    Start(StorageControllerStartCmdArgs),
+    Stop(StorageControllerStopCmdArgs),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start storage controller")]
+struct StorageControllerStartCmdArgs {
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+
+    #[clap(
+        long,
+        help = "Identifier used to distinguish storage controller instances"
+    )]
+    #[arg(default_value_t = 1)]
+    instance_id: u8,
+
+    #[clap(
+        long,
+        help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)"
+    )]
+    base_port: Option<u16>,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Stop storage controller")]
+struct StorageControllerStopCmdArgs {
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
+    #[arg(value_enum, default_value = "fast")]
+    stop_mode: StopMode,
+
+    #[clap(
+        long,
+        help = "Identifier used to distinguish storage controller instances"
+    )]
+    #[arg(default_value_t = 1)]
+    instance_id: u8,
+}
+
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage storage broker")]
+enum StorageBrokerCmd {
+    Start(StorageBrokerStartCmdArgs),
+    Stop(StorageBrokerStopCmdArgs),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start broker")]
+struct StorageBrokerStartCmdArgs {
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "stop broker")]
+struct StorageBrokerStopCmdArgs {
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
+    #[arg(value_enum, default_value = "fast")]
+    stop_mode: StopMode,
+}
+
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage safekeepers")]
+enum SafekeeperCmd {
+    Start(SafekeeperStartCmdArgs),
+    Stop(SafekeeperStopCmdArgs),
+    Restart(SafekeeperRestartCmdArgs),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start local safekeeper")]
+struct SafekeeperStartCmdArgs {
+    #[clap(help = "safekeeper id")]
+    #[arg(default_value_t = NodeId(1))]
+    id: NodeId,
+
+    #[clap(
+        short = 'e',
+        long = "safekeeper-extra-opt",
+        help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
+    )]
+    extra_opt: Vec<String>,
+
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Stop local safekeeper")]
+struct SafekeeperStopCmdArgs {
+    #[clap(help = "safekeeper id")]
+    #[arg(default_value_t = NodeId(1))]
+    id: NodeId,
+
+    #[arg(value_enum, default_value = "fast")]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
+    stop_mode: StopMode,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Restart local safekeeper")]
+struct SafekeeperRestartCmdArgs {
+    #[clap(help = "safekeeper id")]
+    #[arg(default_value_t = NodeId(1))]
+    id: NodeId,
+
+    #[arg(value_enum, default_value = "fast")]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
+    stop_mode: StopMode,
+
+    #[clap(
+        short = 'e',
+        long = "safekeeper-extra-opt",
+        help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
+    )]
+    extra_opt: Vec<String>,
+
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage Postgres instances")]
+enum EndpointCmd {
+    List(EndpointListCmdArgs),
+    Create(EndpointCreateCmdArgs),
+    Start(EndpointStartCmdArgs),
+    Reconfigure(EndpointReconfigureCmdArgs),
+    Stop(EndpointStopCmdArgs),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "List endpoints")]
+struct EndpointListCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_shard_id: Option<TenantShardId>,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Create a compute endpoint")]
+struct EndpointCreateCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: Option<TenantId>,
+
+    #[clap(help = "Postgres endpoint id")]
+    endpoint_id: Option<String>,
+    #[clap(long, help = "Name of the branch the endpoint will run on")]
+    branch_name: Option<String>,
+    #[clap(
+        long,
+        help = "Specify Lsn on the timeline to start from. By default, end of the timeline would be used"
+    )]
+    lsn: Option<Lsn>,
+    #[clap(long)]
+    pg_port: Option<u16>,
+    #[clap(long)]
+    http_port: Option<u16>,
+    #[clap(long = "pageserver-id")]
+    endpoint_pageserver_id: Option<NodeId>,
+
+    #[clap(
+        long,
+        help = "Don't do basebackup, create endpoint directory with only config files",
+        action = clap::ArgAction::Set,
+        default_value_t = false
+    )]
+    config_only: bool,
+
+    #[arg(default_value_t = DEFAULT_PG_VERSION)]
+    #[clap(long, help = "Postgres version")]
+    pg_version: u32,
+
+    #[clap(
+        long,
+        help = "If set, the node will be a hot replica on the specified timeline",
+        action = clap::ArgAction::Set,
+        default_value_t = false
+    )]
+    hot_standby: bool,
+
+    #[clap(long, help = "If set, will set up the catalog for neon_superuser")]
+    update_catalog: bool,
+
+    #[clap(
+        long,
+        help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
+    )]
+    allow_multiple: bool,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")]
+struct EndpointStartCmdArgs {
+    #[clap(help = "Postgres endpoint id")]
+    endpoint_id: String,
+    #[clap(long = "pageserver-id")]
+    endpoint_pageserver_id: Option<NodeId>,
+
+    #[clap(long)]
+    safekeepers: Option<String>,
+
+    #[clap(
+        long,
+        help = "Configure the remote extensions storage proxy gateway to request for extensions."
+    )]
+    remote_ext_config: Option<String>,
+
+    #[clap(
+        long,
+        help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`"
+    )]
+    create_test_user: bool,
+
+    #[clap(
+        long,
+        help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
+    )]
+    allow_multiple: bool,
+
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Reconfigure an endpoint")]
+struct EndpointReconfigureCmdArgs {
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: Option<TenantId>,
+
+    #[clap(help = "Postgres endpoint id")]
+    endpoint_id: String,
+    #[clap(long = "pageserver-id")]
+    endpoint_pageserver_id: Option<NodeId>,
+
+    #[clap(long)]
+    safekeepers: Option<String>,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Stop an endpoint")]
+struct EndpointStopCmdArgs {
+    #[clap(help = "Postgres endpoint id")]
+    endpoint_id: String,
+
+    #[clap(
+        long,
+        help = "Also delete data directory (now optional, should be default in future)"
+    )]
+    destroy: bool,
+
+    #[clap(long, help = "Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")]
+    #[arg(value_parser(["smart", "fast", "immediate"]))]
+    #[arg(default_value = "fast")]
+    mode: String,
+}
+
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage neon_local branch name mappings")]
+enum MappingsCmd {
+    Map(MappingsMapCmdArgs),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Create new mapping which cannot exist already")]
+struct MappingsMapCmdArgs {
+    #[clap(
+        long,
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    tenant_id: TenantId,
+    #[clap(
+        long,
+        help = "Timeline id. Represented as a hexadecimal string 32 symbols length"
+    )]
+    timeline_id: TimelineId,
+    #[clap(long, help = "Branch name to give to the timeline")]
+    branch_name: String,
+}
+
 ///
 /// Timelines tree element used as a value in the HashMap.
 ///
@@ -80,19 +697,13 @@ struct TimelineTreeEl {
 //   * Providing CLI api to the pageserver
 //   * TODO: export/import to/from usual postgres
 fn main() -> Result<()> {
-    let matches = cli().get_matches();
-
-    let (sub_name, sub_args) = match matches.subcommand() {
-        Some(subcommand_data) => subcommand_data,
-        None => bail!("no subcommand provided"),
-    };
+    let cli = Cli::parse();
 
     // Check for 'neon init' command first.
-    let subcommand_result = if sub_name == "init" {
-        handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
+    let subcommand_result = if let NeonLocalCmd::Init(args) = cli.command {
+        handle_init(&args).map(|env| Some(Cow::Owned(env)))
     } else {
         // all other commands need an existing config
-
         let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
         let original_env = env.clone();
         let env = Box::leak(Box::new(env));
@@ -101,19 +712,20 @@ fn main() -> Result<()> {
             .build()
             .unwrap();
 
-        let subcommand_result = match sub_name {
-            "tenant" => rt.block_on(handle_tenant(sub_args, env)),
-            "timeline" => rt.block_on(handle_timeline(sub_args, env)),
-            "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
-            "stop" => rt.block_on(handle_stop_all(sub_args, env)),
-            "pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
-            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
-            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
-            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
-            "endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
-            "mappings" => handle_mappings(sub_args, env),
-            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
-            _ => bail!("unexpected subcommand {sub_name}"),
+        let subcommand_result = match cli.command {
+            NeonLocalCmd::Init(_) => unreachable!("init was handled earlier already"),
+            NeonLocalCmd::Start(args) => rt.block_on(handle_start_all(&args, env)),
+            NeonLocalCmd::Stop(args) => rt.block_on(handle_stop_all(&args, env)),
+            NeonLocalCmd::Tenant(subcmd) => rt.block_on(handle_tenant(&subcmd, env)),
+            NeonLocalCmd::Timeline(subcmd) => rt.block_on(handle_timeline(&subcmd, env)),
+            NeonLocalCmd::Pageserver(subcmd) => rt.block_on(handle_pageserver(&subcmd, env)),
+            NeonLocalCmd::StorageController(subcmd) => {
+                rt.block_on(handle_storage_controller(&subcmd, env))
+            }
+            NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
+            NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
+            NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
+            NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
         };
 
         if &original_env != env {
@@ -263,10 +875,13 @@ async fn get_timeline_infos(
         .collect())
 }
 
-// Helper function to parse --tenant_id option, or get the default from config file
-fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<TenantId> {
-    if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() {
-        tenant_id_from_arguments
+/// Helper function to get tenant id from an optional --tenant_id option or from the config file
+fn get_tenant_id(
+    tenant_id_arg: Option<TenantId>,
+    env: &local_env::LocalEnv,
+) -> anyhow::Result<TenantId> {
+    if let Some(tenant_id_from_arguments) = tenant_id_arg {
+        Ok(tenant_id_from_arguments)
     } else if let Some(default_id) = env.default_tenant_id {
         Ok(default_id)
     } else {
@@ -274,13 +889,14 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
     }
 }
 
-// Helper function to parse --tenant_id option, for commands that accept a shard suffix
+/// Helper function to get tenant-shard ID from an optional --tenant_id option or from the config file,
+/// for commands that accept a shard suffix
 fn get_tenant_shard_id(
-    sub_match: &ArgMatches,
+    tenant_shard_id_arg: Option<TenantShardId>,
     env: &local_env::LocalEnv,
 ) -> anyhow::Result<TenantShardId> {
-    if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() {
-        tenant_id_from_arguments
+    if let Some(tenant_id_from_arguments) = tenant_shard_id_arg {
+        Ok(tenant_id_from_arguments)
     } else if let Some(default_id) = env.default_tenant_id {
         Ok(TenantShardId::unsharded(default_id))
     } else {
@@ -288,41 +904,11 @@ fn get_tenant_shard_id(
     }
 }
 
-fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
-    sub_match
-        .get_one::<String>("tenant-id")
-        .map(|tenant_id| TenantId::from_str(tenant_id))
-        .transpose()
-        .context("Failed to parse tenant id from the argument string")
-}
-
-fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantShardId>> {
-    sub_match
-        .get_one::<String>("tenant-id")
-        .map(|id_str| TenantShardId::from_str(id_str))
-        .transpose()
-        .context("Failed to parse tenant shard id from the argument string")
-}
-
-fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
-    sub_match
-        .get_one::<String>("timeline-id")
-        .map(|timeline_id| TimelineId::from_str(timeline_id))
-        .transpose()
-        .context("Failed to parse timeline id from the argument string")
-}
-
-fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
-    let num_pageservers = init_match.get_one::<u16>("num-pageservers");
-
-    let force = init_match.get_one("force").expect("we set a default value");
-
+fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
     // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`.
-    let init_conf: NeonLocalInitConf = if let Some(config_path) =
-        init_match.get_one::<PathBuf>("config")
-    {
+    let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config {
         // User (likely the Python test suite) provided a description of the environment.
-        if num_pageservers.is_some() {
+        if args.num_pageservers.is_some() {
             bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
         }
         // load and parse the file
@@ -346,7 +932,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
                 http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
                 ..Default::default()
             }],
-            pageservers: (0..num_pageservers.copied().unwrap_or(1))
+            pageservers: (0..args.num_pageservers.unwrap_or(1))
                 .map(|i| {
                     let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
                     let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
@@ -369,7 +955,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
         }
     };
 
-    LocalEnv::init(init_conf, force)
+    LocalEnv::init(init_conf, &args.force)
         .context("materialize initial neon_local environment on disk")?;
     Ok(LocalEnv::load_config(&local_env::base_path())
         .expect("freshly written config should be loadable"))
@@ -387,19 +973,16 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
     PageServerNode::from_env(env, ps_conf)
 }
 
-async fn handle_tenant(
-    tenant_match: &ArgMatches,
-    env: &mut local_env::LocalEnv,
-) -> anyhow::Result<()> {
+async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
     let pageserver = get_default_pageserver(env);
-    match tenant_match.subcommand() {
-        Some(("list", _)) => {
+    match subcmd {
+        TenantCmd::List => {
             for t in pageserver.tenant_list().await? {
                 println!("{} {:?}", t.id, t.state);
             }
         }
-        Some(("import", import_match)) => {
-            let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
+        TenantCmd::Import(args) => {
+            let tenant_id = args.tenant_id;
 
             let storage_controller = StorageController::from_env(env);
             let create_response = storage_controller.tenant_import(tenant_id).await?;
@@ -446,31 +1029,14 @@ async fn handle_tenant(
                 env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
             }
         }
-        Some(("create", create_match)) => {
-            let tenant_conf: HashMap<_, _> = create_match
-                .get_many::<String>("config")
-                .map(|vals: clap::parser::ValuesRef<'_, String>| {
-                    vals.flat_map(|c| c.split_once(':')).collect()
-                })
-                .unwrap_or_default();
-
-            let shard_count: u8 = create_match
-                .get_one::<u8>("shard-count")
-                .cloned()
-                .unwrap_or(0);
-
-            let shard_stripe_size: Option<u32> =
-                create_match.get_one::<u32>("shard-stripe-size").cloned();
-
-            let placement_policy = match create_match.get_one::<String>("placement-policy") {
-                Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
-                _ => PlacementPolicy::Attached(0),
-            };
+        TenantCmd::Create(args) => {
+            let tenant_conf: HashMap<_, _> =
+                args.config.iter().flat_map(|c| c.split_once(':')).collect();
 
             let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
 
             // If tenant ID was not specified, generate one
-            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
+            let tenant_id = args.tenant_id.unwrap_or_else(TenantId::generate);
 
             // We must register the tenant with the storage controller, so
             // that when the pageserver restarts, it will be re-attached.
@@ -478,29 +1044,26 @@ async fn handle_tenant(
             storage_controller
                 .tenant_create(TenantCreateRequest {
                     // Note that ::unsharded here isn't actually because the tenant is unsharded, its because the
-                    // storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
-                    // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards)
+                    // storage controller expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
+                    // type is used both in the storage controller (for creating tenants) and in the pageserver (for
+                    // creating shards)
                     new_tenant_id: TenantShardId::unsharded(tenant_id),
                     generation: None,
                     shard_parameters: ShardParameters {
-                        count: ShardCount::new(shard_count),
-                        stripe_size: shard_stripe_size
+                        count: ShardCount::new(args.shard_count),
+                        stripe_size: args
+                            .shard_stripe_size
                             .map(ShardStripeSize)
                             .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
                     },
-                    placement_policy: Some(placement_policy),
+                    placement_policy: args.placement_policy.clone(),
                     config: tenant_conf,
                 })
                 .await?;
             println!("tenant {tenant_id} successfully created on the pageserver");
 
             // Create an initial timeline for the new tenant
-            let new_timeline_id =
-                parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
-            let pg_version = create_match
-                .get_one::<u32>("pg-version")
-                .copied()
-                .context("Failed to parse postgres version from the argument string")?;
+            let new_timeline_id = args.timeline_id.unwrap_or(TimelineId::generate());
 
             // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
             // different shards picking different start lsns.  Maybe we have to teach storage controller
@@ -513,7 +1076,7 @@ async fn handle_tenant(
                         ancestor_timeline_id: None,
                         ancestor_start_lsn: None,
                         existing_initdb_timeline_id: None,
-                        pg_version: Some(pg_version),
+                        pg_version: Some(args.pg_version),
                     },
                 )
                 .await?;
@@ -526,23 +1089,19 @@ async fn handle_tenant(
 
             println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);
 
-            if create_match.get_flag("set-default") {
+            if args.set_default {
                 println!("Setting tenant {tenant_id} as a default one");
                 env.default_tenant_id = Some(tenant_id);
             }
         }
-        Some(("set-default", set_default_match)) => {
-            let tenant_id =
-                parse_tenant_id(set_default_match)?.context("No tenant id specified")?;
-            println!("Setting tenant {tenant_id} as a default one");
-            env.default_tenant_id = Some(tenant_id);
+        TenantCmd::SetDefault(args) => {
+            println!("Setting tenant {} as a default one", args.tenant_id);
+            env.default_tenant_id = Some(args.tenant_id);
         }
-        Some(("config", create_match)) => {
-            let tenant_id = get_tenant_id(create_match, env)?;
-            let tenant_conf: HashMap<_, _> = create_match
-                .get_many::<String>("config")
-                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
-                .unwrap_or_default();
+        TenantCmd::Config(args) => {
+            let tenant_id = get_tenant_id(args.tenant_id, env)?;
+            let tenant_conf: HashMap<_, _> =
+                args.config.iter().flat_map(|c| c.split_once(':')).collect();
 
             pageserver
                 .tenant_config(tenant_id, tenant_conf)
@@ -550,36 +1109,25 @@ async fn handle_tenant(
                 .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
             println!("tenant {tenant_id} successfully configured on the pageserver");
         }
-
-        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
-        None => bail!("no tenant subcommand provided"),
     }
     Ok(())
 }
 
-async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Result<()> {
     let pageserver = get_default_pageserver(env);
 
-    match timeline_match.subcommand() {
-        Some(("list", list_match)) => {
+    match cmd {
+        TimelineCmd::List(args) => {
             // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
             // where shard 0 is attached, and query there.
-            let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
+            let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?;
             let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
             print_timelines_tree(timelines, env.timeline_name_mappings())?;
         }
-        Some(("create", create_match)) => {
-            let tenant_id = get_tenant_id(create_match, env)?;
-            let new_branch_name = create_match
-                .get_one::<String>("branch-name")
-                .ok_or_else(|| anyhow!("No branch name provided"))?;
-
-            let pg_version = create_match
-                .get_one::<u32>("pg-version")
-                .copied()
-                .context("Failed to parse postgres version from the argument string")?;
-
-            let new_timeline_id_opt = parse_timeline_id(create_match)?;
+        TimelineCmd::Create(args) => {
+            let tenant_id = get_tenant_id(args.tenant_id, env)?;
+            let new_branch_name = &args.branch_name;
+            let new_timeline_id_opt = args.timeline_id;
             let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
 
             let storage_controller = StorageController::from_env(env);
@@ -588,7 +1136,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 ancestor_timeline_id: None,
                 existing_initdb_timeline_id: None,
                 ancestor_start_lsn: None,
-                pg_version: Some(pg_version),
+                pg_version: Some(args.pg_version),
             };
             let timeline_info = storage_controller
                 .tenant_timeline_create(tenant_id, create_req)
@@ -602,67 +1150,42 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 timeline_info.timeline_id
             );
         }
-        Some(("import", import_match)) => {
-            let tenant_id = get_tenant_id(import_match, env)?;
-            let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
-            let branch_name = import_match
-                .get_one::<String>("branch-name")
-                .ok_or_else(|| anyhow!("No branch name provided"))?;
+        TimelineCmd::Import(args) => {
+            let tenant_id = get_tenant_id(args.tenant_id, env)?;
+            let timeline_id = args.timeline_id;
+            let branch_name = &args.branch_name;
 
             // Parse base inputs
-            let base_tarfile = import_match
-                .get_one::<PathBuf>("base-tarfile")
-                .ok_or_else(|| anyhow!("No base-tarfile provided"))?
-                .to_owned();
-            let base_lsn = Lsn::from_str(
-                import_match
-                    .get_one::<String>("base-lsn")
-                    .ok_or_else(|| anyhow!("No base-lsn provided"))?,
-            )?;
-            let base = (base_lsn, base_tarfile);
+            let base = (args.base_lsn, args.base_tarfile.clone());
 
             // Parse pg_wal inputs
-            let wal_tarfile = import_match.get_one::<PathBuf>("wal-tarfile").cloned();
-            let end_lsn = import_match
-                .get_one::<String>("end-lsn")
-                .map(|s| Lsn::from_str(s).unwrap());
+            let wal_tarfile = args.wal_tarfile.clone();
+            let end_lsn = args.end_lsn;
             // TODO validate both or none are provided
             let pg_wal = end_lsn.zip(wal_tarfile);
 
-            let pg_version = import_match
-                .get_one::<u32>("pg-version")
-                .copied()
-                .context("Failed to parse postgres version from the argument string")?;
-
             println!("Importing timeline into pageserver ...");
             pageserver
-                .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
+                .timeline_import(tenant_id, timeline_id, base, pg_wal, args.pg_version)
                 .await?;
             env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?;
             println!("Done");
         }
-        Some(("branch", branch_match)) => {
-            let tenant_id = get_tenant_id(branch_match, env)?;
-            let new_timeline_id =
-                parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate());
-            let new_branch_name = branch_match
-                .get_one::<String>("branch-name")
-                .ok_or_else(|| anyhow!("No branch name provided"))?;
-            let ancestor_branch_name = branch_match
-                .get_one::<String>("ancestor-branch-name")
-                .map(|s| s.as_str())
-                .unwrap_or(DEFAULT_BRANCH_NAME);
+        TimelineCmd::Branch(args) => {
+            let tenant_id = get_tenant_id(args.tenant_id, env)?;
+            let new_timeline_id = args.timeline_id.unwrap_or(TimelineId::generate());
+            let new_branch_name = &args.branch_name;
+            let ancestor_branch_name = args
+                .ancestor_branch_name
+                .clone()
+                .unwrap_or(DEFAULT_BRANCH_NAME.to_owned());
             let ancestor_timeline_id = env
-                .get_branch_timeline_id(ancestor_branch_name, tenant_id)
+                .get_branch_timeline_id(&ancestor_branch_name, tenant_id)
                 .ok_or_else(|| {
                     anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'")
                 })?;
 
-            let start_lsn = branch_match
-                .get_one::<String>("ancestor-start-lsn")
-                .map(|lsn_str| Lsn::from_str(lsn_str))
-                .transpose()
-                .context("Failed to parse ancestor start Lsn from the request")?;
+            let start_lsn = args.ancestor_start_lsn;
             let storage_controller = StorageController::from_env(env);
             let create_req = TimelineCreateRequest {
                 new_timeline_id,
@@ -684,25 +1207,19 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 timeline_info.timeline_id
             );
         }
-        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"),
-        None => bail!("no tenant subcommand provided"),
     }
 
     Ok(())
 }
 
-async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match ep_match.subcommand() {
-        Some(ep_subcommand_data) => ep_subcommand_data,
-        None => bail!("no endpoint subcommand provided"),
-    };
+async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Result<()> {
     let mut cplane = ComputeControlPlane::load(env.clone())?;
 
-    match sub_name {
-        "list" => {
+    match subcmd {
+        EndpointCmd::List(args) => {
             // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
             // where shard 0 is attached, and query there.
-            let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
+            let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?;
             let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
                 .await
                 .unwrap_or_else(|e| {
@@ -766,52 +1283,29 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
 
             println!("{table}");
         }
-        "create" => {
-            let tenant_id = get_tenant_id(sub_args, env)?;
-            let branch_name = sub_args
-                .get_one::<String>("branch-name")
-                .map(|s| s.as_str())
-                .unwrap_or(DEFAULT_BRANCH_NAME);
-            let endpoint_id = sub_args
-                .get_one::<String>("endpoint_id")
-                .map(String::to_string)
+        EndpointCmd::Create(args) => {
+            let tenant_id = get_tenant_id(args.tenant_id, env)?;
+            let branch_name = args
+                .branch_name
+                .clone()
+                .unwrap_or(DEFAULT_BRANCH_NAME.to_owned());
+            let endpoint_id = args
+                .endpoint_id
+                .clone()
                 .unwrap_or_else(|| format!("ep-{branch_name}"));
-            let update_catalog = sub_args
-                .get_one::<bool>("update-catalog")
-                .cloned()
-                .unwrap_or_default();
 
-            let lsn = sub_args
-                .get_one::<String>("lsn")
-                .map(|lsn_str| Lsn::from_str(lsn_str))
-                .transpose()
-                .context("Failed to parse Lsn from the request")?;
             let timeline_id = env
-                .get_branch_timeline_id(branch_name, tenant_id)
+                .get_branch_timeline_id(&branch_name, tenant_id)
                 .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
 
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
-            let pg_version = sub_args
-                .get_one::<u32>("pg-version")
-                .copied()
-                .context("Failed to parse postgres version from the argument string")?;
-
-            let hot_standby = sub_args
-                .get_one::<bool>("hot-standby")
-                .copied()
-                .unwrap_or(false);
-
-            let allow_multiple = sub_args.get_flag("allow-multiple");
-
-            let mode = match (lsn, hot_standby) {
+            let mode = match (args.lsn, args.hot_standby) {
                 (Some(lsn), false) => ComputeMode::Static(lsn),
                 (None, true) => ComputeMode::Replica,
                 (None, false) => ComputeMode::Primary,
                 (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
             };
 
-            match (mode, hot_standby) {
+            match (mode, args.hot_standby) {
                 (ComputeMode::Static(_), true) => {
                     bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
                 }
@@ -821,7 +1315,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 _ => {}
             }
 
-            if !allow_multiple {
+            if !args.allow_multiple {
                 cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
             }
 
@@ -829,34 +1323,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 &endpoint_id,
                 tenant_id,
                 timeline_id,
-                pg_port,
-                http_port,
-                pg_version,
+                args.pg_port,
+                args.http_port,
+                args.pg_version,
                 mode,
-                !update_catalog,
+                !args.update_catalog,
             )?;
         }
-        "start" => {
-            let endpoint_id = sub_args
-                .get_one::<String>("endpoint_id")
-                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
-
-            let pageserver_id =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    Some(NodeId(
-                        id_str.parse().context("while parsing pageserver id")?,
-                    ))
-                } else {
-                    None
-                };
-
-            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
-
-            let allow_multiple = sub_args.get_flag("allow-multiple");
+        EndpointCmd::Start(args) => {
+            let endpoint_id = &args.endpoint_id;
+            let pageserver_id = args.endpoint_pageserver_id;
+            let remote_ext_config = &args.remote_ext_config;
 
             // If --safekeepers argument is given, use only the listed
             // safekeeper nodes; otherwise all from the env.
-            let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? {
+            let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
                 safekeepers
             } else {
                 env.safekeepers.iter().map(|sk| sk.id).collect()
@@ -867,12 +1348,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 .get(endpoint_id.as_str())
                 .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
 
-            let create_test_user = sub_args
-                .get_one::<bool>("create-test-user")
-                .cloned()
-                .unwrap_or_default();
-
-            if !allow_multiple {
+            if !args.allow_multiple {
                 cplane.check_conflicting_endpoints(
                     endpoint.mode,
                     endpoint.tenant_id,
@@ -936,72 +1412,61 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                     &auth_token,
                     safekeepers,
                     pageservers,
-                    remote_ext_config,
+                    remote_ext_config.as_ref(),
                     stripe_size.0 as usize,
-                    create_test_user,
+                    args.create_test_user,
                 )
                 .await?;
         }
-        "reconfigure" => {
-            let endpoint_id = sub_args
-                .get_one::<String>("endpoint_id")
-                .ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
+        EndpointCmd::Reconfigure(args) => {
+            let endpoint_id = &args.endpoint_id;
             let endpoint = cplane
                 .endpoints
                 .get(endpoint_id.as_str())
                 .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageservers =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?);
-                    let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
-                    vec![(
-                        pageserver.pg_connection_config.host().clone(),
-                        pageserver.pg_connection_config.port(),
-                    )]
-                } else {
-                    let storage_controller = StorageController::from_env(env);
-                    storage_controller
-                        .tenant_locate(endpoint.tenant_id)
-                        .await?
-                        .shards
-                        .into_iter()
-                        .map(|shard| {
-                            (
-                                Host::parse(&shard.listen_pg_addr)
-                                    .expect("Storage controller reported malformed host"),
-                                shard.listen_pg_port,
-                            )
-                        })
-                        .collect::<Vec<_>>()
-                };
+            let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
+                let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
+                vec![(
+                    pageserver.pg_connection_config.host().clone(),
+                    pageserver.pg_connection_config.port(),
+                )]
+            } else {
+                let storage_controller = StorageController::from_env(env);
+                storage_controller
+                    .tenant_locate(endpoint.tenant_id)
+                    .await?
+                    .shards
+                    .into_iter()
+                    .map(|shard| {
+                        (
+                            Host::parse(&shard.listen_pg_addr)
+                                .expect("Storage controller reported malformed host"),
+                            shard.listen_pg_port,
+                        )
+                    })
+                    .collect::<Vec<_>>()
+            };
             // If --safekeepers argument is given, use only the listed
             // safekeeper nodes; otherwise all from the env.
-            let safekeepers = parse_safekeepers(sub_args)?;
+            let safekeepers = parse_safekeepers(&args.safekeepers)?;
             endpoint.reconfigure(pageservers, None, safekeepers).await?;
         }
-        "stop" => {
-            let endpoint_id = sub_args
-                .get_one::<String>("endpoint_id")
-                .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
-            let destroy = sub_args.get_flag("destroy");
-            let mode = sub_args.get_one::<String>("mode").expect("has a default");
-
+        EndpointCmd::Stop(args) => {
+            let endpoint_id = &args.endpoint_id;
             let endpoint = cplane
                 .endpoints
-                .get(endpoint_id.as_str())
+                .get(endpoint_id)
                 .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            endpoint.stop(mode, destroy)?;
+            endpoint.stop(&args.mode, args.destroy)?;
         }
-
-        _ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
     }
 
     Ok(())
 }
 
 /// Parse --safekeepers as list of safekeeper ids.
-fn parse_safekeepers(sub_args: &ArgMatches) -> Result<Option<Vec<NodeId>>> {
-    if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
+fn parse_safekeepers(safekeepers_str: &Option<String>) -> Result<Option<Vec<NodeId>>> {
+    if let Some(safekeepers_str) = safekeepers_str {
         let mut safekeepers: Vec<NodeId> = Vec::new();
         for sk_id in safekeepers_str.split(',').map(str::trim) {
             let sk_id = NodeId(
@@ -1016,44 +1481,25 @@ fn parse_safekeepers(sub_args: &ArgMatches) -> Result<Option<Vec<NodeId>>> {
     }
 }
 
-fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match sub_match.subcommand() {
-        Some(ep_subcommand_data) => ep_subcommand_data,
-        None => bail!("no mappings subcommand provided"),
-    };
-
-    match sub_name {
-        "map" => {
-            let branch_name = sub_args
-                .get_one::<String>("branch-name")
-                .expect("branch-name argument missing");
-
-            let tenant_id = sub_args
-                .get_one::<String>("tenant-id")
-                .map(|x| TenantId::from_str(x))
-                .expect("tenant-id argument missing")
-                .expect("malformed tenant-id arg");
-
-            let timeline_id = sub_args
-                .get_one::<String>("timeline-id")
-                .map(|x| TimelineId::from_str(x))
-                .expect("timeline-id argument missing")
-                .expect("malformed timeline-id arg");
-
-            env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
+fn handle_mappings(subcmd: &MappingsCmd, env: &mut local_env::LocalEnv) -> Result<()> {
+    match subcmd {
+        MappingsCmd::Map(args) => {
+            env.register_branch_mapping(
+                args.branch_name.to_owned(),
+                args.tenant_id,
+                args.timeline_id,
+            )?;
 
             Ok(())
         }
-        other => unimplemented!("mappings subcommand {other}"),
     }
 }
 
-fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
-    let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
-        NodeId(id_str.parse().context("while parsing pageserver id")?)
-    } else {
-        DEFAULT_PAGESERVER_ID
-    };
+fn get_pageserver(
+    env: &local_env::LocalEnv,
+    pageserver_id_arg: Option<NodeId>,
+) -> Result<PageServerNode> {
+    let node_id = pageserver_id_arg.unwrap_or(DEFAULT_PAGESERVER_ID);
 
     Ok(PageServerNode::from_env(
         env,
@@ -1061,48 +1507,11 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
     ))
 }
 
-fn get_start_timeout(args: &ArgMatches) -> &Duration {
-    let humantime_duration = args
-        .get_one::<humantime::Duration>("start-timeout")
-        .expect("invalid value for start-timeout");
-    humantime_duration.as_ref()
-}
-
-fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
-    let maybe_instance_id = args.get_one::<u8>("instance-id");
-
-    let base_port = args.get_one::<u16>("base-port");
-
-    if maybe_instance_id.is_some() && base_port.is_none() {
-        panic!("storage-controller start specificied instance-id but did not provide base-port");
-    }
-
-    let start_timeout = args
-        .get_one::<humantime::Duration>("start-timeout")
-        .expect("invalid value for start-timeout");
-
-    NeonStorageControllerStartArgs {
-        instance_id: maybe_instance_id.copied().unwrap_or(1),
-        base_port: base_port.copied(),
-        start_timeout: *start_timeout,
-    }
-}
-
-fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
-    let maybe_instance_id = args.get_one::<u8>("instance-id");
-    let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
-
-    NeonStorageControllerStopArgs {
-        instance_id: maybe_instance_id.copied().unwrap_or(1),
-        immediate,
-    }
-}
-
-async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    match sub_match.subcommand() {
-        Some(("start", subcommand_args)) => {
-            if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(get_start_timeout(subcommand_args))
+async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> Result<()> {
+    match subcmd {
+        PageserverCmd::Start(args) => {
+            if let Err(e) = get_pageserver(env, args.pageserver_id)?
+                .start(&args.start_timeout)
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1110,34 +1519,36 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
             }
         }
 
-        Some(("stop", subcommand_args)) => {
-            let immediate = subcommand_args
-                .get_one::<String>("stop-mode")
-                .map(|s| s.as_str())
-                == Some("immediate");
-
-            if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) {
+        PageserverCmd::Stop(args) => {
+            let immediate = match args.stop_mode {
+                StopMode::Fast => false,
+                StopMode::Immediate => true,
+            };
+            if let Err(e) = get_pageserver(env, args.pageserver_id)?.stop(immediate) {
                 eprintln!("pageserver stop failed: {}", e);
                 exit(1);
             }
         }
 
-        Some(("restart", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
+        PageserverCmd::Restart(args) => {
+            let pageserver = get_pageserver(env, args.pageserver_id)?;
             //TODO what shutdown strategy should we use here?
             if let Err(e) = pageserver.stop(false) {
                 eprintln!("pageserver stop failed: {}", e);
                 exit(1);
             }
 
-            if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await {
+            if let Err(e) = pageserver.start(&args.start_timeout).await {
                 eprintln!("pageserver start failed: {e}");
                 exit(1);
             }
         }
 
-        Some(("status", subcommand_args)) => {
-            match get_pageserver(env, subcommand_args)?.check_status().await {
+        PageserverCmd::Status(args) => {
+            match get_pageserver(env, args.pageserver_id)?
+                .check_status()
+                .await
+            {
                 Ok(_) => println!("Page server is up and running"),
                 Err(err) => {
                     eprintln!("Page server is not available: {}", err);
@@ -1145,34 +1556,42 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
                 }
             }
         }
-
-        Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
-        None => bail!("no pageserver subcommand provided"),
     }
     Ok(())
 }
 
 async fn handle_storage_controller(
-    sub_match: &ArgMatches,
+    subcmd: &StorageControllerCmd,
     env: &local_env::LocalEnv,
 ) -> Result<()> {
     let svc = StorageController::from_env(env);
-    match sub_match.subcommand() {
-        Some(("start", start_match)) => {
-            if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
+    match subcmd {
+        StorageControllerCmd::Start(args) => {
+            let start_args = NeonStorageControllerStartArgs {
+                instance_id: args.instance_id,
+                base_port: args.base_port,
+                start_timeout: args.start_timeout,
+            };
+
+            if let Err(e) = svc.start(start_args).await {
                 eprintln!("start failed: {e}");
                 exit(1);
             }
         }
 
-        Some(("stop", stop_match)) => {
-            if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
+        StorageControllerCmd::Stop(args) => {
+            let stop_args = NeonStorageControllerStopArgs {
+                instance_id: args.instance_id,
+                immediate: match args.stop_mode {
+                    StopMode::Fast => false,
+                    StopMode::Immediate => true,
+                },
+            };
+            if let Err(e) = svc.stop(stop_args).await {
                 eprintln!("stop failed: {}", e);
                 exit(1);
             }
         }
-        Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name),
-        None => bail!("no storage_controller subcommand provided"),
     }
     Ok(())
 }
@@ -1185,111 +1604,77 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
     }
 }
 
-// Get list of options to append to safekeeper command invocation.
-fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
-    init_match
-        .get_many::<String>("safekeeper-extra-opt")
-        .into_iter()
-        .flatten()
-        .map(|s| s.to_owned())
-        .collect()
-}
+async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> Result<()> {
+    match subcmd {
+        SafekeeperCmd::Start(args) => {
+            let safekeeper = get_safekeeper(env, args.id)?;
 
-async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match sub_match.subcommand() {
-        Some(safekeeper_command_data) => safekeeper_command_data,
-        None => bail!("no safekeeper subcommand provided"),
-    };
-
-    // All the commands take an optional safekeeper name argument
-    let sk_id = if let Some(id_str) = sub_args.get_one::<String>("id") {
-        NodeId(id_str.parse().context("while parsing safekeeper id")?)
-    } else {
-        DEFAULT_SAFEKEEPER_ID
-    };
-    let safekeeper = get_safekeeper(env, sk_id)?;
-
-    match sub_name {
-        "start" => {
-            let extra_opts = safekeeper_extra_opts(sub_args);
-
-            if let Err(e) = safekeeper
-                .start(extra_opts, get_start_timeout(sub_args))
-                .await
-            {
+            if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await {
                 eprintln!("safekeeper start failed: {}", e);
                 exit(1);
             }
         }
 
-        "stop" => {
-            let immediate =
-                sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
-
+        SafekeeperCmd::Stop(args) => {
+            let safekeeper = get_safekeeper(env, args.id)?;
+            let immediate = match args.stop_mode {
+                StopMode::Fast => false,
+                StopMode::Immediate => true,
+            };
             if let Err(e) = safekeeper.stop(immediate) {
                 eprintln!("safekeeper stop failed: {}", e);
                 exit(1);
             }
         }
 
-        "restart" => {
-            let immediate =
-                sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
+        SafekeeperCmd::Restart(args) => {
+            let safekeeper = get_safekeeper(env, args.id)?;
+            let immediate = match args.stop_mode {
+                StopMode::Fast => false,
+                StopMode::Immediate => true,
+            };
 
             if let Err(e) = safekeeper.stop(immediate) {
                 eprintln!("safekeeper stop failed: {}", e);
                 exit(1);
             }
 
-            let extra_opts = safekeeper_extra_opts(sub_args);
-            if let Err(e) = safekeeper
-                .start(extra_opts, get_start_timeout(sub_args))
-                .await
-            {
+            if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await {
                 eprintln!("safekeeper start failed: {}", e);
                 exit(1);
             }
         }
-
-        _ => {
-            bail!("Unexpected safekeeper subcommand '{}'", sub_name)
-        }
     }
     Ok(())
 }
 
-async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match sub_match.subcommand() {
-        Some(broker_command_data) => broker_command_data,
-        None => bail!("no broker subcommand provided"),
-    };
-
-    match sub_name {
-        "start" => {
-            if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
+async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> {
+    match subcmd {
+        StorageBrokerCmd::Start(args) => {
+            if let Err(e) = broker::start_broker_process(env, &args.start_timeout).await {
                 eprintln!("broker start failed: {e}");
                 exit(1);
             }
         }
 
-        "stop" => {
+        StorageBrokerCmd::Stop(_args) => {
+            // FIXME: stop_mode unused
             if let Err(e) = broker::stop_broker_process(env) {
                 eprintln!("broker stop failed: {e}");
                 exit(1);
             }
         }
-
-        _ => bail!("Unexpected broker subcommand '{}'", sub_name),
     }
     Ok(())
 }
 
 async fn handle_start_all(
+    args: &StartCmdArgs,
     env: &'static local_env::LocalEnv,
-    retry_timeout: &Duration,
 ) -> anyhow::Result<()> {
-    let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
-        neon_start_status_check(env, retry_timeout)
+    // FIXME: this was called "retry_timeout", is it right?
+    let Err(errors) = handle_start_all_impl(env, args.timeout).await else {
+        neon_start_status_check(env, args.timeout.as_ref())
             .await
             .context("status check after successful startup of all services")?;
         return Ok(());
@@ -1314,7 +1699,7 @@ async fn handle_start_all(
 /// Otherwise, returns the list of errors that occurred during startup.
 async fn handle_start_all_impl(
     env: &'static local_env::LocalEnv,
-    retry_timeout: Duration,
+    retry_timeout: humantime::Duration,
 ) -> Result<(), Vec<anyhow::Error>> {
     // Endpoints are not started automatically
 
@@ -1334,7 +1719,7 @@ async fn handle_start_all_impl(
                 let storage_controller = StorageController::from_env(env);
                 storage_controller
                     .start(NeonStorageControllerStartArgs::with_default_instance_id(
-                        retry_timeout.into(),
+                        retry_timeout,
                     ))
                     .await
                     .map_err(|e| e.context("start storage_controller"))
@@ -1355,7 +1740,7 @@ async fn handle_start_all_impl(
             js.spawn(async move {
                 let safekeeper = SafekeeperNode::from_env(env, node);
                 safekeeper
-                    .start(vec![], &retry_timeout)
+                    .start(&[], &retry_timeout)
                     .await
                     .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
             });
@@ -1435,9 +1820,11 @@ async fn neon_start_status_check(
     anyhow::bail!("\nNeon passed status check")
 }
 
-async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let immediate =
-        sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
+async fn handle_stop_all(args: &StopCmdArgs, env: &local_env::LocalEnv) -> Result<()> {
+    let immediate = match args.mode {
+        StopMode::Fast => false,
+        StopMode::Immediate => true,
+    };
 
     try_stop_all(env, immediate).await;
 
@@ -1495,400 +1882,3 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
         }
     }
 }
-
-fn cli() -> Command {
-    let timeout_arg = Arg::new("start-timeout")
-        .long("start-timeout")
-        .short('t')
-        .global(true)
-        .help("timeout until we fail the command, e.g. 30s")
-        .value_parser(value_parser!(humantime::Duration))
-        .default_value("10s")
-        .required(false);
-
-    let branch_name_arg = Arg::new("branch-name")
-        .long("branch-name")
-        .help("Name of the branch to be created or used as an alias for other services")
-        .required(false);
-
-    let endpoint_id_arg = Arg::new("endpoint_id")
-        .help("Postgres endpoint id")
-        .required(false);
-
-    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
-
-    // --id, when using a pageserver command
-    let pageserver_id_arg = Arg::new("pageserver-id")
-        .long("id")
-        .global(true)
-        .help("pageserver id")
-        .required(false);
-    // --pageserver-id when using a non-pageserver command
-    let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id")
-        .long("pageserver-id")
-        .required(false);
-
-    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
-        .short('e')
-        .long("safekeeper-extra-opt")
-        .num_args(1)
-        .action(ArgAction::Append)
-        .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
-        .required(false);
-
-    let tenant_id_arg = Arg::new("tenant-id")
-        .long("tenant-id")
-        .help("Tenant id. Represented as a hexadecimal string 32 symbols length")
-        .required(false);
-
-    let timeline_id_arg = Arg::new("timeline-id")
-        .long("timeline-id")
-        .help("Timeline id. Represented as a hexadecimal string 32 symbols length")
-        .required(false);
-
-    let pg_version_arg = Arg::new("pg-version")
-        .long("pg-version")
-        .help("Postgres version to use for the initial tenant")
-        .required(false)
-        .value_parser(value_parser!(u32))
-        .default_value(DEFAULT_PG_VERSION);
-
-    let pg_port_arg = Arg::new("pg-port")
-        .long("pg-port")
-        .required(false)
-        .value_parser(value_parser!(u16))
-        .value_name("pg-port");
-
-    let http_port_arg = Arg::new("http-port")
-        .long("http-port")
-        .required(false)
-        .value_parser(value_parser!(u16))
-        .value_name("http-port");
-
-    let safekeepers_arg = Arg::new("safekeepers")
-        .long("safekeepers")
-        .required(false)
-        .value_name("safekeepers");
-
-    let stop_mode_arg = Arg::new("stop-mode")
-        .short('m')
-        .value_parser(["fast", "immediate"])
-        .default_value("fast")
-        .help("If 'immediate', don't flush repository data at shutdown")
-        .required(false)
-        .value_name("stop-mode");
-
-    let remote_ext_config_args = Arg::new("remote-ext-config")
-        .long("remote-ext-config")
-        .num_args(1)
-        .help("Configure the remote extensions storage proxy gateway to request for extensions.")
-        .required(false);
-
-    let lsn_arg = Arg::new("lsn")
-        .long("lsn")
-        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
-        .required(false);
-
-    let hot_standby_arg = Arg::new("hot-standby")
-        .value_parser(value_parser!(bool))
-        .long("hot-standby")
-        .help("If set, the node will be a hot replica on the specified timeline")
-        .required(false);
-
-    let force_arg = Arg::new("force")
-        .value_parser(value_parser!(InitForceMode))
-        .long("force")
-        .default_value(
-            InitForceMode::MustNotExist
-                .to_possible_value()
-                .unwrap()
-                .get_name()
-                .to_owned(),
-        )
-        .help("Force initialization even if the repository is not empty")
-        .required(false);
-
-    let num_pageservers_arg = Arg::new("num-pageservers")
-        .value_parser(value_parser!(u16))
-        .long("num-pageservers")
-        .help("How many pageservers to create (default 1)");
-
-    let update_catalog = Arg::new("update-catalog")
-        .value_parser(value_parser!(bool))
-        .long("update-catalog")
-        .help("If set, will set up the catalog for neon_superuser")
-        .required(false);
-
-    let create_test_user = Arg::new("create-test-user")
-        .value_parser(value_parser!(bool))
-        .long("create-test-user")
-        .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
-        .required(false);
-
-    let allow_multiple = Arg::new("allow-multiple")
-        .help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.")
-        .long("allow-multiple")
-        .action(ArgAction::SetTrue)
-        .required(false);
-
-    let instance_id = Arg::new("instance-id")
-        .long("instance-id")
-        .help("Identifier used to distinguish storage controller instances (default 1)")
-        .value_parser(value_parser!(u8))
-        .required(false);
-
-    let base_port = Arg::new("base-port")
-        .long("base-port")
-        .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
-        .value_parser(value_parser!(u16))
-        .required(false);
-
-    Command::new("Neon CLI")
-        .arg_required_else_help(true)
-        .version(GIT_VERSION)
-        .subcommand(
-            Command::new("init")
-                .about("Initialize a new Neon repository, preparing configs for services to start with")
-                .arg(num_pageservers_arg.clone())
-                .arg(
-                    Arg::new("config")
-                        .long("config")
-                        .required(false)
-                        .value_parser(value_parser!(PathBuf))
-                        .value_name("config")
-                )
-                .arg(force_arg)
-        )
-        .subcommand(
-            Command::new("timeline")
-            .about("Manage timelines")
-            .arg_required_else_help(true)
-            .subcommand(Command::new("list")
-                .about("List all timelines, available to this pageserver")
-                .arg(tenant_id_arg.clone()))
-            .subcommand(Command::new("branch")
-                .about("Create a new timeline, using another timeline as a base, copying its data")
-                .arg(tenant_id_arg.clone())
-                .arg(timeline_id_arg.clone())
-                .arg(branch_name_arg.clone())
-                .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
-                    .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
-                .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn")
-                    .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
-            .subcommand(Command::new("create")
-                .about("Create a new blank timeline")
-                .arg(tenant_id_arg.clone())
-                .arg(timeline_id_arg.clone())
-                .arg(branch_name_arg.clone())
-                .arg(pg_version_arg.clone())
-            )
-            .subcommand(Command::new("import")
-                .about("Import timeline from basebackup directory")
-                .arg(tenant_id_arg.clone())
-                .arg(timeline_id_arg.clone())
-                .arg(branch_name_arg.clone())
-                .arg(Arg::new("base-tarfile")
-                    .long("base-tarfile")
-                    .value_parser(value_parser!(PathBuf))
-                    .help("Basebackup tarfile to import")
-                )
-                .arg(Arg::new("base-lsn").long("base-lsn")
-                    .help("Lsn the basebackup starts at"))
-                .arg(Arg::new("wal-tarfile")
-                    .long("wal-tarfile")
-                    .value_parser(value_parser!(PathBuf))
-                    .help("Wal to add after base")
-                )
-                .arg(Arg::new("end-lsn").long("end-lsn")
-                    .help("Lsn the basebackup ends at"))
-                .arg(pg_version_arg.clone())
-            )
-        ).subcommand(
-            Command::new("tenant")
-            .arg_required_else_help(true)
-            .about("Manage tenants")
-            .subcommand(Command::new("list"))
-            .subcommand(Command::new("create")
-                .arg(tenant_id_arg.clone())
-                .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
-                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
-                .arg(pg_version_arg.clone())
-                .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
-                    .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
-                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
-                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
-                .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant"))
-                )
-            .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
-                .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
-            .subcommand(Command::new("config")
-                .arg(tenant_id_arg.clone())
-                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
-            .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
-                .about("Import a tenant that is present in remote storage, and create branches for its timelines"))
-        )
-        .subcommand(
-            Command::new("pageserver")
-                .arg_required_else_help(true)
-                .about("Manage pageserver")
-                .arg(pageserver_id_arg)
-                .subcommand(Command::new("status"))
-                .subcommand(Command::new("start")
-                    .about("Start local pageserver")
-                    .arg(timeout_arg.clone())
-                )
-                .subcommand(Command::new("stop")
-                    .about("Stop local pageserver")
-                    .arg(stop_mode_arg.clone())
-                )
-                .subcommand(Command::new("restart")
-                    .about("Restart local pageserver")
-                    .arg(timeout_arg.clone())
-                )
-        )
-        .subcommand(
-            Command::new("storage_controller")
-                .arg_required_else_help(true)
-                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start storage controller")
-                            .arg(timeout_arg.clone())
-                            .arg(instance_id.clone())
-                            .arg(base_port))
-                .subcommand(Command::new("stop").about("Stop storage controller")
-                            .arg(stop_mode_arg.clone())
-                            .arg(instance_id))
-        )
-        .subcommand(
-            Command::new("storage_broker")
-                .arg_required_else_help(true)
-                .about("Manage broker")
-                .subcommand(Command::new("start")
-                            .about("Start broker")
-                            .arg(timeout_arg.clone())
-                )
-                .subcommand(Command::new("stop")
-                            .about("Stop broker")
-                            .arg(stop_mode_arg.clone())
-                )
-        )
-        .subcommand(
-            Command::new("safekeeper")
-                .arg_required_else_help(true)
-                .about("Manage safekeepers")
-                .subcommand(Command::new("start")
-                            .about("Start local safekeeper")
-                            .arg(safekeeper_id_arg.clone())
-                            .arg(safekeeper_extra_opt_arg.clone())
-                            .arg(timeout_arg.clone())
-                )
-                .subcommand(Command::new("stop")
-                            .about("Stop local safekeeper")
-                            .arg(safekeeper_id_arg.clone())
-                            .arg(stop_mode_arg.clone())
-                )
-                .subcommand(Command::new("restart")
-                            .about("Restart local safekeeper")
-                            .arg(safekeeper_id_arg)
-                            .arg(stop_mode_arg.clone())
-                            .arg(safekeeper_extra_opt_arg)
-                            .arg(timeout_arg.clone())
-                )
-        )
-        .subcommand(
-            Command::new("endpoint")
-                .arg_required_else_help(true)
-                .about("Manage postgres instances")
-                .subcommand(Command::new("list").arg(tenant_id_arg.clone()))
-                .subcommand(Command::new("create")
-                    .about("Create a compute endpoint")
-                    .arg(endpoint_id_arg.clone())
-                    .arg(branch_name_arg.clone())
-                    .arg(tenant_id_arg.clone())
-                    .arg(lsn_arg.clone())
-                    .arg(pg_port_arg.clone())
-                    .arg(http_port_arg.clone())
-                    .arg(endpoint_pageserver_id_arg.clone())
-                    .arg(
-                        Arg::new("config-only")
-                            .help("Don't do basebackup, create endpoint directory with only config files")
-                            .long("config-only")
-                            .required(false))
-                    .arg(pg_version_arg.clone())
-                    .arg(hot_standby_arg.clone())
-                    .arg(update_catalog)
-                    .arg(allow_multiple.clone())
-                )
-                .subcommand(Command::new("start")
-                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
-                    .arg(endpoint_id_arg.clone())
-                    .arg(endpoint_pageserver_id_arg.clone())
-                    .arg(safekeepers_arg.clone())
-                    .arg(remote_ext_config_args)
-                    .arg(create_test_user)
-                    .arg(allow_multiple.clone())
-                    .arg(timeout_arg.clone())
-                )
-                .subcommand(Command::new("reconfigure")
-                            .about("Reconfigure the endpoint")
-                            .arg(endpoint_pageserver_id_arg)
-                            .arg(safekeepers_arg)
-                            .arg(endpoint_id_arg.clone())
-                            .arg(tenant_id_arg.clone())
-                )
-                .subcommand(
-                    Command::new("stop")
-                    .arg(endpoint_id_arg)
-                    .arg(
-                        Arg::new("destroy")
-                            .help("Also delete data directory (now optional, should be default in future)")
-                            .long("destroy")
-                            .action(ArgAction::SetTrue)
-                            .required(false)
-                    )
-                    .arg(
-                        Arg::new("mode")
-                            .help("Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")
-                            .long("mode")
-                            .action(ArgAction::Set)
-                            .required(false)
-                            .value_parser(["smart", "fast", "immediate"])
-                            .default_value("fast")
-                    )
-                )
-
-        )
-        .subcommand(
-            Command::new("mappings")
-                .arg_required_else_help(true)
-                .about("Manage neon_local branch name mappings")
-                .subcommand(
-                    Command::new("map")
-                        .about("Create new mapping which cannot exist already")
-                        .arg(branch_name_arg.clone())
-                        .arg(tenant_id_arg.clone())
-                        .arg(timeline_id_arg.clone())
-                )
-        )
-        // Obsolete old name for 'endpoint'. We now just print an error if it's used.
-        .subcommand(
-            Command::new("pg")
-                .hide(true)
-                .arg(Arg::new("ignore-rest").allow_hyphen_values(true).num_args(0..).required(false))
-                .trailing_var_arg(true)
-        )
-        .subcommand(
-            Command::new("start")
-                .about("Start page server and safekeepers")
-                .arg(timeout_arg.clone())
-        )
-        .subcommand(
-            Command::new("stop")
-                .about("Stop page server and safekeepers")
-                .arg(stop_mode_arg)
-        )
-}
-
-#[test]
-fn verify_cli() {
-    cli().debug_assert();
-}
diff --git a/control_plane/src/branch_mappings.rs b/control_plane/src/branch_mappings.rs
new file mode 100644
index 0000000000..e89313df39
--- /dev/null
+++ b/control_plane/src/branch_mappings.rs
@@ -0,0 +1,94 @@
+//! Branch mappings for convenience
+
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+
+use anyhow::{bail, Context};
+use serde::{Deserialize, Serialize};
+
+use utils::id::{TenantId, TenantTimelineId, TimelineId};
+
+/// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user.
+#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
+#[serde(default, deny_unknown_fields)]
+pub struct BranchMappings {
+    /// Default tenant ID to use with the 'neon_local' command line utility, when
+    /// --tenant_id is not explicitly specified. This comes from the branches.
+    pub default_tenant_id: Option<TenantId>,
+
+    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
+    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
+    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    pub mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
+}
+
+impl BranchMappings {
+    pub fn register_branch_mapping(
+        &mut self,
+        branch_name: String,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<()> {
+        let existing_values = self.mappings.entry(branch_name.clone()).or_default();
+
+        let existing_ids = existing_values
+            .iter()
+            .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
+
+        if let Some((_, old_timeline_id)) = existing_ids {
+            if old_timeline_id == &timeline_id {
+                Ok(())
+            } else {
+                bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
+            }
+        } else {
+            existing_values.push((tenant_id, timeline_id));
+            Ok(())
+        }
+    }
+
+    pub fn get_branch_timeline_id(
+        &self,
+        branch_name: &str,
+        tenant_id: TenantId,
+    ) -> Option<TimelineId> {
+        // If it looks like a timeline ID, return it as it is
+        if let Ok(timeline_id) = branch_name.parse::<TimelineId>() {
+            return Some(timeline_id);
+        }
+
+        self.mappings
+            .get(branch_name)?
+            .iter()
+            .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
+            .map(|&(_, timeline_id)| timeline_id)
+            .map(TimelineId::from)
+    }
+
+    pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
+        self.mappings
+            .iter()
+            .flat_map(|(name, tenant_timelines)| {
+                tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
+                    (TenantTimelineId::new(tenant_id, timeline_id), name.clone())
+                })
+            })
+            .collect()
+    }
+
+    pub fn persist(&self, path: &Path) -> anyhow::Result<()> {
+        let content = &toml::to_string_pretty(self)?;
+        fs::write(path, content).with_context(|| {
+            format!(
+                "Failed to write branch information into path '{}'",
+                path.display()
+            )
+        })
+    }
+
+    pub fn load(path: &Path) -> anyhow::Result<BranchMappings> {
+        let branches_file_contents = fs::read_to_string(path)?;
+        Ok(toml::from_str(branches_file_contents.as_str())?)
+    }
+}
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index 573f1688d5..7a019bce88 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -113,7 +113,7 @@ impl SafekeeperNode {
 
     pub async fn start(
         &self,
-        extra_opts: Vec<String>,
+        extra_opts: &[String],
         retry_timeout: &Duration,
     ) -> anyhow::Result<()> {
         print!(
@@ -196,7 +196,7 @@ impl SafekeeperNode {
             ]);
         }
 
-        args.extend(extra_opts);
+        args.extend_from_slice(extra_opts);
 
         background_process::start_process(
             &format!("safekeeper-{id}"),

From ba498a630a70a3ba22a9ab70a6e42da5f82889a8 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Fri, 27 Sep 2024 20:52:22 +0100
Subject: [PATCH 27/29] Set disk quotas on bind in compute_ctl (#8936)

Part of https://github.com/neondatabase/cloud/issues/13127. Resolves
#9153

What changed in this PR:
1. Adds `ComputeSpec.disk_quota_bytes: Option<u64>`
2. Adds new arg to compute_ctl: `--set-disk-quota-for-fs <mountpoint>`
3. Implements running `/neonvm/bin/set-disk-quota` with the right value
if both cmdline arg AND field in the spec are specified
4. Patches `/etc/sudoers.d` to allow `compute_ctl` to set quota with
sudo

This PR is very similar to the swap support added earlier, you can take
a look at it as prior art: #7434

In theory, it can be implemented outside of compute_ctl when we will
have a separate neonvm daemon, but we are not there yet. Current
implementation is the simplest possible to unblock computes with larger
disks.

All code related to usage of `/neonvm/bin/set-disk-quota` is located in
`disk_quota.rs`. We need to call this script with the following
arguments: `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`. Quotas
are set on the filesystem level, so we need to provide path to the
directory that filesystem was mounted to.

I tested this change locally with
https://github.com/neondatabase/cloud/pull/17270. It should be safe to
merge, because this feature is gated by both cmdline arg and field in
the spec. If control-plane doesn't set values in both places,
compute_ctl won't be affected by this change.
---
 compute/vm-image-spec.yaml           | 13 +++++--
 compute_tools/src/bin/compute_ctl.rs | 56 ++++++++++++++++++++--------
 compute_tools/src/compute.rs         |  7 ++++
 compute_tools/src/disk_quota.rs      | 25 +++++++++++++
 compute_tools/src/lib.rs             |  1 +
 control_plane/src/endpoint.rs        |  1 +
 libs/compute_api/src/spec.rs         | 10 +++++
 7 files changed, 93 insertions(+), 20 deletions(-)
 create mode 100644 compute_tools/src/disk_quota.rs

diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec.yaml
index 0af44745e5..50fcd62e4f 100644
--- a/compute/vm-image-spec.yaml
+++ b/compute/vm-image-spec.yaml
@@ -11,6 +11,10 @@ commands:
     user: root
     sysvInitAction: sysinit
     shell: 'chmod 711 /neonvm/bin/resize-swap'
+  - name: chmod-set-disk-quota
+    user: root
+    sysvInitAction: sysinit
+    shell: 'chmod 711 /neonvm/bin/set-disk-quota'
   - name: pgbouncer
     user: postgres
     sysvInitAction: respawn
@@ -30,11 +34,12 @@ commands:
 shutdownHook: |
   su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
 files:
-  - filename: compute_ctl-resize-swap
+  - filename: compute_ctl-sudoers
     content: |
       # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
-      # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
+      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
+      # regardless of hostname (ALL)
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
   - filename: cgconfig.conf
     content: |
       # Configuration for cgroups in VM compute nodes
@@ -100,7 +105,7 @@ merge: |
       && apt install --no-install-recommends -y \
              sudo \
       && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-  COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
+  COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers
 
   COPY cgconfig.conf /etc/cgconfig.conf
 
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 9499a7186e..b10638c454 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -44,6 +44,7 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
+use compute_tools::disk_quota::set_disk_quota;
 use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
@@ -151,6 +152,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
     let spec_json = matches.get_one::<String>("spec");
     let spec_path = matches.get_one::<String>("spec-path");
     let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
+    let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");
 
     Ok(ProcessCliResult {
         connstr,
@@ -161,6 +163,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
         spec_json,
         spec_path,
         resize_swap_on_bind,
+        set_disk_quota_for_fs,
     })
 }
 
@@ -173,6 +176,7 @@ struct ProcessCliResult<'clap> {
     spec_json: Option<&'clap String>,
     spec_path: Option<&'clap String>,
     resize_swap_on_bind: bool,
+    set_disk_quota_for_fs: Option<&'clap String>,
 }
 
 fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
@@ -293,6 +297,7 @@ fn wait_spec(
         pgbin,
         ext_remote_storage,
         resize_swap_on_bind,
+        set_disk_quota_for_fs,
         http_port,
         ..
     }: ProcessCliResult,
@@ -373,6 +378,7 @@ fn wait_spec(
         compute,
         http_port,
         resize_swap_on_bind,
+        set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
     })
 }
 
@@ -381,6 +387,7 @@ struct WaitSpecResult {
     // passed through from ProcessCliResult
     http_port: u16,
     resize_swap_on_bind: bool,
+    set_disk_quota_for_fs: Option<String>,
 }
 
 fn start_postgres(
@@ -390,6 +397,7 @@ fn start_postgres(
         compute,
         http_port,
         resize_swap_on_bind,
+        set_disk_quota_for_fs,
     }: WaitSpecResult,
 ) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
     // We got all we need, update the state.
@@ -403,6 +411,7 @@ fn start_postgres(
     );
     // before we release the mutex, fetch the swap size (if any) for later.
     let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
+    let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes;
     drop(state);
 
     // Launch remaining service threads
@@ -422,8 +431,8 @@ fn start_postgres(
         // OOM-killed during startup because swap wasn't available yet.
         match resize_swap(size_bytes) {
             Ok(()) => {
-                let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
-                info!(%size_bytes, %size_gib, "resized swap");
+                let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
+                info!(%size_bytes, %size_mib, "resized swap");
             }
             Err(err) => {
                 let err = err.context("failed to resize swap");
@@ -432,10 +441,29 @@ fn start_postgres(
                 // Mark compute startup as failed; don't try to start postgres, and report this
                 // error to the control plane when it next asks.
                 prestartup_failed = true;
-                let mut state = compute.state.lock().unwrap();
-                state.error = Some(format!("{err:?}"));
-                state.status = ComputeStatus::Failed;
-                compute.state_changed.notify_all();
+                compute.set_failed_status(err);
+                delay_exit = true;
+            }
+        }
+    }
+
+    // Set disk quota if the compute spec says so
+    if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
+        (disk_quota_bytes, set_disk_quota_for_fs)
+    {
+        match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
+            Ok(()) => {
+                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
+                info!(%disk_quota_bytes, %size_mib, "set disk quota");
+            }
+            Err(err) => {
+                let err = err.context("failed to set disk quota");
+                error!("{err:#}");
+
+                // Mark compute startup as failed; don't try to start postgres, and report this
+                // error to the control plane when it next asks.
+                prestartup_failed = true;
+                compute.set_failed_status(err);
                 delay_exit = true;
             }
         }
@@ -450,16 +478,7 @@ fn start_postgres(
             Ok(pg) => Some(pg),
             Err(err) => {
                 error!("could not start the compute node: {:#}", err);
-                let mut state = compute.state.lock().unwrap();
-                state.error = Some(format!("{:?}", err));
-                state.status = ComputeStatus::Failed;
-                // Notify others that Postgres failed to start. In case of configuring the
-                // empty compute, it's likely that API handler is still waiting for compute
-                // state change. With this we will notify it that compute is in Failed state,
-                // so control plane will know about it earlier and record proper error instead
-                // of timeout.
-                compute.state_changed.notify_all();
-                drop(state); // unlock
+                compute.set_failed_status(err);
                 delay_exit = true;
                 None
             }
@@ -750,6 +769,11 @@ fn cli() -> clap::Command {
                 .long("resize-swap-on-bind")
                 .action(clap::ArgAction::SetTrue),
         )
+        .arg(
+            Arg::new("set-disk-quota-for-fs")
+                .long("set-disk-quota-for-fs")
+                .value_name("SET_DISK_QUOTA_FOR_FS")
+        )
 }
 
 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index a6b7633eda..147eb2a161 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -306,6 +306,13 @@ impl ComputeNode {
         self.state_changed.notify_all();
     }
 
+    pub fn set_failed_status(&self, err: anyhow::Error) {
+        let mut state = self.state.lock().unwrap();
+        state.error = Some(format!("{err:?}"));
+        state.status = ComputeStatus::Failed;
+        self.state_changed.notify_all();
+    }
+
     pub fn get_status(&self) -> ComputeStatus {
         self.state.lock().unwrap().status
     }
diff --git a/compute_tools/src/disk_quota.rs b/compute_tools/src/disk_quota.rs
new file mode 100644
index 0000000000..e838c5b9fd
--- /dev/null
+++ b/compute_tools/src/disk_quota.rs
@@ -0,0 +1,25 @@
+use anyhow::Context;
+
+pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";
+
+/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
+/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
+pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
+    let size_kb = size_bytes / 1024;
+    // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
+    let child_result = std::process::Command::new("/usr/bin/sudo")
+        .arg(DISK_QUOTA_BIN)
+        .arg(size_kb.to_string())
+        .arg(fs_mountpoint)
+        .spawn();
+
+    child_result
+        .context("spawn() failed")
+        .and_then(|mut child| child.wait().context("wait() failed"))
+        .and_then(|status| match status.success() {
+            true => Ok(()),
+            false => Err(anyhow::anyhow!("process exited with {status}")),
+        })
+        // wrap any prior error with the overall context that we couldn't run the command
+        .with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`"))
+}
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index c402d63305..c5b4ca632c 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -10,6 +10,7 @@ pub mod http;
 pub mod logger;
 pub mod catalog;
 pub mod compute;
+pub mod disk_quota;
 pub mod extension_server;
 pub mod lsn_lease;
 mod migration;
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 7554a03a68..18f396b886 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -561,6 +561,7 @@ impl Endpoint {
             operation_uuid: None,
             features: self.features.clone(),
             swap_size_bytes: None,
+            disk_quota_bytes: None,
             cluster: Cluster {
                 cluster_id: None, // project ID: not used
                 name: None,       // project name: not used
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 525a1572ff..83515a00a0 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -50,6 +50,16 @@ pub struct ComputeSpec {
     #[serde(default)]
     pub swap_size_bytes: Option<u64>,
 
+    /// If compute_ctl was passed `--set-disk-quota-for-fs`, a value of `Some(_)` instructs
+    /// compute_ctl to run `/neonvm/bin/set-disk-quota` with the given size and fs, when the
+    /// spec is first received.
+    ///
+    /// Both this field and `--set-disk-quota-for-fs` are required, so that the control plane's
+    /// spec generation doesn't need to be aware of the actual compute it's running on, while
+    /// guaranteeing gradual rollout of disk quota.
+    #[serde(default)]
+    pub disk_quota_bytes: Option<u64>,
+
     /// Expected cluster state at the end of transition process.
     pub cluster: Cluster,
     pub delta_operations: Option<Vec<DeltaOp>>,

From d2d992176159732e77f46d3d302981aa95c7cd8f Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sat, 28 Sep 2024 02:44:22 +0100
Subject: [PATCH 28/29] CI(benchmarking): fix Nightly Benchmarks (#9178)

## Problem

Nightly Benchmarks have been broken for some time due to various
reasons, this PR fixes it

## Summary of changes
- Pull `build-tools` image from dockerhub for `benchmarking` workflow
- Use `aws-actions/configure-aws-credentials` to upload/download
artifacts from S3
- Fix Postgres 16 installation (for pgbench)
---
 .../workflows/_benchmarking_preparation.yml   |  46 ++++---
 .github/workflows/benchmarking.yml            | 119 ++++++++++++++----
 2 files changed, 122 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml
index a52e43b4da..d60f97320b 100644
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -3,19 +3,23 @@ name: Prepare benchmarking databases by restoring dumps
 on:
   workflow_call:
     # no inputs needed
-    
+
 defaults:
   run:
     shell: bash -euxo pipefail {0}
 
 jobs:
   setup-databases:
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
     strategy:
       fail-fast: false
       matrix:
-        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] 
+        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
         database: [ clickbench, tpch, userexample ]
-  
+
     env:
       LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
       PLATFORM: ${{ matrix.platform }}
@@ -23,7 +27,10 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
@@ -32,13 +39,13 @@ jobs:
       run: |
         case "${PLATFORM}" in
           neon)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} 
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
             ;;
           aws-rds-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} 
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
             ;;
           aws-aurora-serverless-v2-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} 
+            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
             ;;
           *)
             echo >&2 "Unknown PLATFORM=${PLATFORM}"
@@ -46,10 +53,17 @@ jobs:
             ;;
         esac
 
-        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
 
     - uses: actions/checkout@v4
 
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:
@@ -57,23 +71,23 @@ jobs:
         path: /tmp/neon/
         prefix: latest
 
-    # we create a table that has one row for each database that we want to restore with the status whether the restore is done    
+    # we create a table that has one row for each database that we want to restore with the status whether the restore is done
     - name: Create benchmark_restore_status table if it does not exist
       env:
         BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
         DATABASE_NAME: ${{ matrix.database }}
-      # to avoid a race condition of multiple jobs trying to create the table at the same time, 
+      # to avoid a race condition of multiple jobs trying to create the table at the same time,
       # we use an advisory lock
       run: |
         ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
-        SELECT pg_advisory_lock(4711);  
+        SELECT pg_advisory_lock(4711);
         CREATE TABLE IF NOT EXISTS benchmark_restore_status (
         databasename text primary key,
         restore_done boolean
         );
         SELECT pg_advisory_unlock(4711);
         "
-    
+
     - name: Check if restore is already done
       id: check-restore-done
       env:
@@ -107,7 +121,7 @@ jobs:
         DATABASE_NAME: ${{ matrix.database }}
       run: |
         mkdir -p /tmp/dumps
-        aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ 
+        aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
 
     - name: Replace database name in connection string
       if: steps.check-restore-done.outputs.skip != 'true'
@@ -126,17 +140,17 @@ jobs:
         else
           new_connstr="${base_connstr}/${DATABASE_NAME}"
         fi
-        echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT  
+        echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
 
     - name: Restore dump
       if: steps.check-restore-done.outputs.skip != 'true'
       env:
         DATABASE_NAME: ${{ matrix.database }}
         DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
-        # the following works only with larger computes: 
+        # the following works only with larger computes:
         # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
         # we add the || true because:
-        # the dumps were created with Neon and contain neon extensions that are not 
+        # the dumps were created with Neon and contain neon extensions that are not
         # available in RDS, so we will always report an error, but we can ignore it
       run: |
         ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index a4a597acde..a0d9bdf07b 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -12,7 +12,6 @@ on:
     #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
     #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
     - cron:   '0 3 * * *' # run once a day, timezone is utc
-
   workflow_dispatch: # adds ability to run this manually
     inputs:
       region_id:
@@ -59,7 +58,7 @@ jobs:
     permissions:
       contents: write
       statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
+      id-token: write # aws-actions/configure-aws-credentials
     strategy:
       fail-fast: false
       matrix:
@@ -68,12 +67,10 @@ jobs:
             PLATFORM: "neon-staging"
             region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
             RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
           - DEFAULT_PG_VERSION: 16
             PLATFORM: "azure-staging"
             region_id: 'azure-eastus2'
             RUNNER: [ self-hosted, eastus2, x64 ]
-            IMAGE: neondatabase/build-tools:pinned
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "300"
       TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -86,7 +83,10 @@ jobs:
 
     runs-on: ${{ matrix.RUNNER }}
     container:
-      image: ${{ matrix.IMAGE }}
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
@@ -164,6 +164,10 @@ jobs:
 
   replication-tests:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
       DEFAULT_PG_VERSION: 16
@@ -174,12 +178,21 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
     - uses: actions/checkout@v4
 
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
 
     - name: Download Neon artifact
       uses: ./.github/actions/download
@@ -267,7 +280,7 @@ jobs:
         region_id_default=${{ env.DEFAULT_REGION_ID }}
         runner_default='["self-hosted", "us-east-2", "x64"]'
         runner_azure='["self-hosted", "eastus2", "x64"]'
-        image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
+        image_default="neondatabase/build-tools:pinned"
         matrix='{
           "pg_version" : [
             16
@@ -344,7 +357,7 @@ jobs:
     permissions:
       contents: write
       statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
+      id-token: write # aws-actions/configure-aws-credentials
 
     strategy:
       fail-fast: false
@@ -371,7 +384,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
-    - name: Configure AWS credentials # necessary on Azure runners
+    - name: Configure AWS credentials
       uses: aws-actions/configure-aws-credentials@v4
       with:
         aws-region: eu-central-1
@@ -492,17 +505,15 @@ jobs:
     permissions:
       contents: write
       statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
+      id-token: write # aws-actions/configure-aws-credentials
     strategy:
       fail-fast: false
       matrix:
         include:
           - PLATFORM: "neonvm-captest-pgvector"
             RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
           - PLATFORM: "azure-captest-pgvector"
             RUNNER: [ self-hosted, eastus2, x64 ]
-            IMAGE: neondatabase/build-tools:pinned
 
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -511,13 +522,16 @@ jobs:
       DEFAULT_PG_VERSION: 16
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
-      LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
+
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.PLATFORM }}
 
     runs-on: ${{ matrix.RUNNER }}
     container:
-      image: ${{ matrix.IMAGE }}
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
@@ -527,17 +541,26 @@ jobs:
     # instead of using Neon artifacts containing pgbench
     - name: Install postgresql-16 where pytest expects it
       run: |
+        # Just to make it easier to test things locally on macOS (with arm64)
+        arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
+
         cd /home/nonroot
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb 
-        dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb"
+        dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg
+        dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg
+        dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg
+
         mkdir -p /tmp/neon/pg_install/v16/bin
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench  
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql  
-        ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib 
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v16/bin/psql
+        ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v16/lib
+
+        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH}"
+        export LD_LIBRARY_PATH
+        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
+
         /tmp/neon/pg_install/v16/bin/pgbench --version
         /tmp/neon/pg_install/v16/bin/psql --version
 
@@ -559,7 +582,7 @@ jobs:
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
 
-    - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
+    - name: Configure AWS credentials
       uses: aws-actions/configure-aws-credentials@v4
       with:
         aws-region: eu-central-1
@@ -620,6 +643,10 @@ jobs:
     # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
     # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
     if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
     needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
 
     strategy:
@@ -638,12 +665,22 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
     - uses: actions/checkout@v4
 
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:
@@ -714,6 +751,10 @@ jobs:
     #
     # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
     if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
     needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
 
     strategy:
@@ -731,12 +772,22 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
     - uses: actions/checkout@v4
 
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:
@@ -806,6 +857,10 @@ jobs:
 
   user-examples-compare:
     if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
     needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
 
     strategy:
@@ -822,12 +877,22 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
     - uses: actions/checkout@v4
 
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:

From 3c721920658ae5996b8c5653db8299f3ddd82605 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sun, 29 Sep 2024 23:27:53 +0100
Subject: [PATCH 29/29] CI(benchmarking): fix setting LD_LIBRARY_PATH (#9191)

## Problem

`pgbench-pgvector` job from Nightly Benchmarks fails with the error:

```
/__w/_temp/f45bc2eb-4c4c-4f0a-8030-99079303fa65.sh: line 17: LD_LIBRARY_PATH: unbound variable
```

## Summary of changes
- Fix `LD_LIBRARY_PATH: unbound variable` error in benchmarks
---
 .github/workflows/benchmarking.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index a0d9bdf07b..32806b89ab 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -557,7 +557,7 @@ jobs:
         ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v16/bin/psql
         ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v16/lib
 
-        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH}"
+        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
         export LD_LIBRARY_PATH
         echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}