From 4476caf670c9e52480e93e39b570e3fc70c83d34 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 12 Aug 2024 09:17:31 +0100
Subject: [PATCH 01/62] CI: add `actions/set-docker-config-dir` to set
 DOCKER_CONFIG (#8676)

## Problem

In several workflows, we have repeating code which is separated into
two steps:
```bash
mkdir -p $(pwd)/.docker-custom
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
...
rm -rf $(pwd)/.docker-custom
```

Such copy-paste is prone to errors; for example, in one case, instead of
`$(pwd)/.docker-custom`, we use `/tmp/.docker-custom`, which is shared
between workflows.

## Summary of changes
- Create a new action `actions/set-docker-config-dir`, which sets
`DOCKER_CONFIG` and deletes it in a Post action part
---
 .../actions/set-docker-config-dir/action.yml  | 36 +++++++++++++
 .github/workflows/build-build-tools-image.yml | 13 +----
 .github/workflows/build_and_test.yml          | 50 ++-----------------
 3 files changed, 41 insertions(+), 58 deletions(-)
 create mode 100644 .github/actions/set-docker-config-dir/action.yml

diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml
new file mode 100644
index 0000000000..3ee8bec8c6
--- /dev/null
+++ b/.github/actions/set-docker-config-dir/action.yml
@@ -0,0 +1,36 @@
+name: "Set custom docker config directory"
+description: "Create a directory for docker config and set DOCKER_CONFIG"
+
+# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+runs:
+  using: "composite"
+  steps:
+  - name: Show warning on GitHub-hosted runners
+    if: runner.environment == 'github-hosted'
+    shell: bash -euo pipefail {0}
+    run: |
+      # Using the following environment variables to find a path to the workflow file
+      # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
+      # ${GITHUB_REPOSITORY}   - octocat/hello-world
+      # ${GITHUB_REF}          - refs/heads/my_branch
+      # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
+
+      filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
+      filename=${filename_with_ref%"@$GITHUB_REF"}
+
+      # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
+      title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
+      message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
+      echo "::warning file=${filename},title=${title}::${message}"
+
+  - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
+    env:
+      DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
+    with:
+      main: |
+        mkdir -p "${DOCKER_CONFIG}"
+        echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
+      post: |
+        if [ -d "${DOCKER_CONFIG}" ]; then
+          rm -r "${DOCKER_CONFIG}"
+        fi
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
index 76fc58151a..f4f6e6971f 100644
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -56,13 +56,7 @@ jobs:
 
       - uses: actions/checkout@v4
 
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p /tmp/.docker-custom
-          echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
-
+      - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
         with:
           cache-binary: false
@@ -89,11 +83,6 @@ jobs:
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
           tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
 
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf /tmp/.docker-custom
-
   merge-images:
     needs: [ build-image ]
     runs-on: ubuntu-22.04
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index c7ae2aedd4..78f9f11a65 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -484,12 +484,7 @@ jobs:
           submodules: true
           fetch-depth: 0
 
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
         with:
           cache-binary: false
@@ -521,11 +516,6 @@ jobs:
           tags: |
             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
-
   neon-image:
     needs: [ neon-image-arch, tag ]
     runs-on: ubuntu-22.04
@@ -570,12 +560,7 @@ jobs:
           submodules: true
           fetch-depth: 0
 
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
         with:
           cache-binary: false
@@ -658,11 +643,6 @@ jobs:
           tags: |
             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
-
   compute-node-image:
     needs: [ compute-node-image-arch, tag ]
     runs-on: ubuntu-22.04
@@ -735,13 +715,7 @@ jobs:
           curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
           chmod +x vm-builder
 
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
+      - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/login-action@v3
         with:
           username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -764,11 +738,6 @@ jobs:
         run: |
           docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
 
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
-
   test-images:
     needs: [ check-permissions, tag, neon-image, compute-node-image ]
     strategy:
@@ -784,13 +753,7 @@ jobs:
         with:
           fetch-depth: 0
 
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
+      - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/login-action@v3
         with:
           username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -830,11 +793,6 @@ jobs:
           docker compose -f ./docker-compose/docker-compose.yml logs || 0
           docker compose -f ./docker-compose/docker-compose.yml down
 
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
-
   promote-images:
     permissions:
       contents: read  # This is required for actions/checkout

From a4eea5025c70d646f5a94481590177af3dfe7491 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 8 Aug 2024 20:01:55 +0300
Subject: [PATCH 02/62] Fix logical apply worker reporting of flush_lsn wrt
 sync replication.

It should take syncrep flush_lsn into account because WAL before it on endpoint
restart is lost, which makes replication miss some data if slot had already been
advanced too far. This commit adds test reproducing the issue and bumps
vendor/postgres to commit with the actual fix.
---
 control_plane/src/endpoint.rs                 | 11 +--
 pgxn/neon/walsender_hooks.c                   |  8 ++
 test_runner/fixtures/neon_fixtures.py         |  2 +-
 .../regress/test_logical_replication.py       | 89 +++++++++++++++++++
 vendor/postgres-v14                           |  2 +-
 vendor/postgres-v15                           |  2 +-
 vendor/postgres-v16                           |  2 +-
 vendor/revisions.json                         | 15 +++-
 8 files changed, 119 insertions(+), 12 deletions(-)

diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index f9bb2da7e7..9f879c4b08 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -824,11 +824,12 @@ impl Endpoint {
         // cleanup work to do after postgres stops, like syncing safekeepers,
         // etc.
         //
-        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
-        // want this cleanup: tests intentionally do stop when majority of
-        // safekeepers is down, so sync-safekeepers would hang otherwise. This
-        // could be a separate flag though.
-        self.wait_for_compute_ctl_to_exit(destroy)?;
+        // If destroying or stop mode is immediate, send it SIGTERM before
+        // waiting. Sometimes we do *not* want this cleanup: tests intentionally
+        // do stop when majority of safekeepers is down, so sync-safekeepers
+        // would hang otherwise. This could be a separate flag though.
+        let send_sigterm = destroy || mode == "immediate";
+        self.wait_for_compute_ctl_to_exit(send_sigterm)?;
         if destroy {
             println!(
                 "Destroying postgres data directory '{}'",
diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c
index 8f8d1dfc01..bd3856e9d9 100644
--- a/pgxn/neon/walsender_hooks.c
+++ b/pgxn/neon/walsender_hooks.c
@@ -20,6 +20,7 @@
 #include "utils/guc.h"
 #include "postmaster/interrupt.h"
 
+#include "neon.h"
 #include "neon_walreader.h"
 #include "walproposer.h"
 
@@ -181,6 +182,13 @@ NeonWALReadSegmentClose(XLogReaderState *xlogreader)
 void
 NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
 {
+	/*
+	 * If safekeepers are not configured, assume we don't need neon_walreader,
+	 * i.e. running neon fork locally.
+	 */
+	if (wal_acceptors_list[0] == '\0')
+		return;
+
 	if (!wal_reader)
 	{
 		XLogRecPtr	epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 844a23d327..4374e74a41 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4893,7 +4893,7 @@ def check_restored_datadir_content(
     assert (mismatch, error) == ([], [])
 
 
-def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn:
+def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn:
     """Wait logical replication subscriber to sync with publisher."""
     publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
     while True:
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 66afe9ddfd..5a5d369a11 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -4,11 +4,13 @@ from random import choice
 from string import ascii_lowercase
 
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     AuxFileStore,
     NeonEnv,
     NeonEnvBuilder,
+    PgProtocol,
     logical_replication_sync,
     wait_for_last_flush_lsn,
 )
@@ -524,3 +526,90 @@ def test_replication_shutdown(neon_simple_env: NeonEnv):
             assert [r[0] for r in res] == [10, 20, 30, 40]
 
         wait_until(10, 0.5, check_that_changes_propagated)
+
+
+def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn:
+    """
+    Wait for logical replication subscriber reported flush_lsn to reach
+    pg_current_wal_flush_lsn on publisher. Note that this is somewhat unreliable
+    because for some WAL records like vacuum subscriber won't get any data at
+    all.
+    """
+    publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+
+    def check_caughtup():
+        res = publisher.safe_psql(
+            """
+select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s
+   where s.active_pid = sr.pid and s.slot_type = 'logical';
+                                  """
+        )[0]
+        sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2])
+        log.info(
+            f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}"
+        )
+        assert flush_lsn >= publisher_flush_lsn
+
+    wait_until(30, 0.5, check_caughtup)
+    return publisher_flush_lsn
+
+
+# Test that subscriber takes into account quorum committed flush_lsn in
+# flush_lsn reporting to publisher. Without this, it may ack too far, losing
+# data on restart because publisher advances START_REPLICATION position to the
+# confirmed_flush_lsn of the slot.
+def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg):
+    env = neon_simple_env
+    # use vanilla as publisher to allow writes on it when safekeeper is down
+    vanilla_pg.configure(
+        [
+            "wal_level = 'logical'",
+            # neon fork uses custom WAL records which won't work without extension installed with obscure
+            # ERROR:  resource manager with ID 134 not registered
+            # error.
+            "shared_preload_libraries = 'neon'",
+        ]
+    )
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create extension neon;")
+
+    env.neon_cli.create_branch("subscriber")
+    sub = env.endpoints.create("subscriber")
+    sub.start()
+
+    with vanilla_pg.cursor() as pcur:
+        with sub.cursor() as scur:
+            pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
+            pcur.execute("CREATE PUBLICATION pub FOR TABLE t")
+            scur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
+
+            pub_connstr = vanilla_pg.connstr().replace("'", "''")
+            log.info(f"pub connstr is {pub_connstr}, subscriber connstr {sub.connstr()}")
+            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_connstr}' PUBLICATION pub with (synchronous_commit=off)"
+            scur.execute(query)
+            time.sleep(2)  # let initial table sync complete
+
+    # stop safekeeper so it won't get any data
+    for sk in env.safekeepers:
+        sk.stop()
+    # and insert to publisher
+    with vanilla_pg.cursor() as pcur:
+        for i in range(0, 1000):
+            pcur.execute("INSERT into t values (%s, random()*100000)", (i,))
+    # wait until sub receives all data
+    logical_replication_sync(sub, vanilla_pg)
+    # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data
+    # as flushed we'll now lose it if subscriber restars. That's why
+    # logical_replication_wait_flush_lsn_sync is expected to hang while
+    # safekeeper is down.
+    vanilla_pg.safe_psql("checkpoint;")
+    assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000
+
+    # restart subscriber and ensure it can catch up lost tail again
+    sub.stop(mode="immediate")
+    for sk in env.safekeepers:
+        sk.start()
+    sub.start()
+    log.info("waiting for sync after restart")
+    logical_replication_wait_flush_lsn_sync(vanilla_pg)
+    assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 7bbe834c8c..ae07734e0f 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6
+Subproject commit ae07734e0ff72759ab425fc8f625d4c1ecb15a50
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 9eba7dd382..47c8d462d1 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73
+Subproject commit 47c8d462d169367c8979ce628a523be2d94b46be
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 5377f5ed72..6434b1499b 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3
+Subproject commit 6434b1499b11ed97dccea5618a055034b83b8e2f
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 570dfc1550..ab8b3b3c4f 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,14 @@
 {
-  "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"],
-  "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"],
-  "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"]
+  "v16": [
+    "16.3",
+    "6434b1499b11ed97dccea5618a055034b83b8e2f"
+  ],
+  "v15": [
+    "15.7",
+    "47c8d462d169367c8979ce628a523be2d94b46be"
+  ],
+  "v14": [
+    "14.12",
+    "ae07734e0ff72759ab425fc8f625d4c1ecb15a50"
+  ]
 }

From 162424ad774505bf38fcda31af81efd4f22de9a2 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 18 Jul 2024 13:51:07 +0300
Subject: [PATCH 03/62] wip

---
 ...35-safekeeper-dynamic-membership-change.md | 329 ++++++++++++++++++
 1 file changed, 329 insertions(+)
 create mode 100644 docs/rfcs/035-safekeeper-dynamic-membership-change.md

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
new file mode 100644
index 0000000000..4872fbaf89
--- /dev/null
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -0,0 +1,329 @@
+# Safekeeper dynamic membership change
+
+To quickly recover from safekeeper node failures and do rebalancing we need to
+be able to change set of safekeepers the timeline resides on. The procedure must
+be safe (not lose committed log) regardless of safekeepers and compute state. It
+should be able to progress if any majority of old safekeeper set, any majority
+of new safekeeper set and compute are up and connected. This is known as a
+consensus membership change. It always involves two phases: 1) switch old
+majority to old + new configuration, preventing commits without acknowledge from
+the new set 2) bootstrap the new set by ensuring majority of the new set has all
+data which ever could have been committed before the first phase completed;
+after that switch is safe to finish. Without two phases switch to the new set
+which quorum might not intersect with quorum of the old set (and typical case of
+ABC -> ABD switch is an example of that, because quorums AC and BD don't
+intersect). Furthermore, procedure is typically carried out by the consensus
+leader, and so enumeration of configurations which establishes order between
+them is done through consensus log.
+
+In our case consensus leader is compute (walproposer), and we don't want to wake
+up all computes for the change. Neither we want to fully reimplement the leader
+logic second time outside compute. Because of that the proposed algorithm relies
+for issuing configurations on the external fault tolerant (distributed) strongly
+consisent storage with simple API: CAS (compare-and-swap) on the single key.
+Properly configured postgres suits this.
+
+In the system consensus is implemented at the timeline level, so algorithm below
+applies to the single timeline.
+
+## Algorithm
+
+### Definitions
+
+A SafekeeperId is
+```
+struct SafekeeperId {
+    node_id: NodeId,
+    // Not strictly required for this RFC but useful for asserts and potentially other purposes in the future
+    hostname: String,
+}
+```
+
+A configuration is
+
+```
+struct Configuration {
+    generation: Generation, // a number uniquely identifying configuration
+    sk_set: Vec<SafekeeperId>, // current safekeeper set
+    new_sk_set: Optional<Vec<SafekeeperId>>,
+}
+```
+
+Configuration with `new_set` present is used for the intermediate step during
+the change and called joint configuration. Generations establish order of
+generations: we say `c1` is higher than `c2` if `c1.generation` >
+`c2.generation`.
+
+### Persistently stored data changes
+
+Safekeeper starts storing its current configuration in the control file. Update
+of is atomic, so in-memory value always matches the persistent one.
+
+External CAS providing storage (let's call it configuration storage here) also
+stores configuration for each timeline. It is initialized with generation 1 and
+initial set of safekeepers during timeline creation. Executed CAS on it must
+never be lost.
+
+### Compute <-> safekeeper protocol changes
+
+`ProposerGreeting` message carries walproposer's configuration if it is already
+established (see below), else null.  `AcceptorGreeting` message carries
+safekeeper's current `Configuration`. All further messages (`VoteRequest`,
+`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry
+generation number, of walproposer in case of wp->sk message or of safekeeper in
+case of sk->wp message.
+
+### Safekeeper changes
+
+Basic rule: once safekeeper observes configuration higher than his own it
+immediately switches to it.
+
+Safekeeper sends its current configuration in its first message to walproposer
+`AcceptorGreeting`. It refuses all other walproposer messages if the
+configuration generation in them is less than its current one. Namely, it
+refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
+response it sends its current configuration generation to let walproposer know.
+
+Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` 
+accepting `Configuration`. Safekeeper switches to the given conf it is higher than its
+current one and ignores it otherwise. In any case it replies with
+```
+struct ConfigurationSwitchResponse {
+    conf: Configuration,
+    last_log_term: Term,
+    flush_lsn: Lsn,
+    term: Term, // not used by this RFC, but might be useful for observability
+}
+```
+
+### Compute (walproposer) changes
+
+Basic rule is that joint configuration requires votes from majorities in the
+both `set` and `new_sk_set`.
+
+Compute receives list of safekeepers to connect to from the control plane as
+currently and tries to communicate with all of them. However, the list does not
+define consensus members. Instead, on start walproposer tracks highest
+configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
+from majority of `sk_set` and majority of `new_sk_set` (if it is present), it
+establishes this configuration as its own and moves to voting. 
+
+It should stop talking to safekeepers not listed in the configuration at this
+point, though it is not unsafe to continue doing so.
+
+To be elected it must receive votes from both majorites if `new_sk_set` is present.
+Similarly, to commit WAL it must receive flush acknowledge from both majorities.
+
+If walproposer hears from safekeeper configuration higher than his own (i.e.
+refusal to accept due to configuration change) it simply restarts.
+
+### Change algorithm
+
+The following algorithm can be executed anywhere having access to configuration
+storage and safekeepers. It is safe to interrupt / restart it and run multiple
+instances of it concurrently, though likely one of them won't make
+progress then. It accepts `desired_set: Vec<SafekeeperId>` as input. 
+
+Algorithm will refuse to make the change if it encounters previous interrupted
+change attempt, but in this case it will try to finish it.
+
+It will eventually converge if old majority, new majority and configuration
+storage are reachable.
+
+1) Fetch current timeline configuration from the configuration storage.
+2) If it is already joint one and `new_set` is different from `desired_set`
+   refuse to change. However, assign join conf to (in memory) var
+   `join_conf` and proceed to step 4 to finish the ongoing change.
+3) Else, create joint `joint_conf: Configuration`: increment current conf number
+   `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
+   storage by doing CAS on the current generation: change happens only if
+   current configuration number is still `n`. Apart from guaranteeing uniqueness
+   of configurations, CAS linearizes them, ensuring that new configuration is
+   created only following the previous one when we know that the transition is
+   safe. Failed CAS aborts the procedure.
+4) Call `PUT` `configuration` on safekeepers from the current set,
+   delivering them `joint_conf`. Collecting responses from majority is required
+   to proceed. If any response returned generation higher than 
+   `joint_conf.generation`, abort (another switch raced us). Otherwise, choose
+   max `<last_log_term, flush_lsn>` among responses and establish it as 
+   (in memory) `sync_position`. We can't finish switch until majority 
+   of the new set catches up to this position because data before it 
+   could be committed without ack from the new set.
+4) Initialize timeline on safekeeper(s) from `new_sk_set` where it 
+   doesn't exist yet by doing `pull_timeline` from current set. Doing 
+   that on majority of `new_sk_set` is enough to proceed, but it is 
+   reasonable to ensure that all `new_sk_set` members are initialized 
+   -- if some of them are down why are we migrating there?
+5) Call `PUT` `configuration` on safekeepers from the new set,
+   delivering them `joint_conf` and collecting their positions. This will
+   switch them to the `joint_conf` which generally won't be needed 
+   because `pull_timeline` already includes it and plus additionally would be
+   broadcast by compute. More importantly, we may proceed to the next step 
+   only when `<last_log_term, flush_lsn>` on the majority of the new set reached 
+   `sync_position`. Similarly, on the happy path this is not needed because 
+   `pull_timeline` already includes it. However, it is better to double
+    check to be safe. For example, timeline could have been created earlier e.g.
+    manually or after try-to-migrate, abort, try-to-migrate-again sequence.   
+6) Create `new_conf: Configuration` incrementing `join_conf` generation and having new 
+   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration 
+   storage under one more CAS.
+7) Call `PUT` `configuration` on safekeepers from the new set,
+   delivering them `new_conf`. It is enough to deliver it to the majority 
+   of the new set; the rest can be updated by compute.
+
+I haven't put huge effort to make the description above very precise, because it
+is natural language prone to interpretations anyway. Instead I'd like to make TLA+
+spec of it.
+
+Description above focuses on safety. To make the flow practical and live, here a few more 
+considerations.
+1) It makes sense to ping new set to ensure it we are migrating to live node(s) before 
+  step 3.
+2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed we
+   can rollback to the old conf with one more CAS.
+3) On step 4 timeline might be already created on members of the new set for various reasons; 
+   the simplest is the procedure restart. There are more complicated scenarious like mentioned
+   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving 
+   generations, so seems simpler to treat existing timeline as success. However, this also 
+   has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
+   the step 5 is never reached until compute is (re)awaken up to synchronize new member(s).
+   I don't think we'll observe this in practice, but can add waking up compute if needed.
+4) To do step 7 in case of failure immediately after completion of CAS in step 6, 
+   configuration storage should also have `delivered_to_majority` flag for non join configurations.
+
+## Implementation
+
+The procedure ought to be driven from somewhere. Obvious candidates are control
+plane and storage_controller; and as each of them already has db we don't want
+yet another storage. I propose to manage safekeepers in storage_controller
+because 1) since it is in rust it simplifies simulation testing (more on this
+below) 2) it already manages pageservers. 
+
+This assumes that migration will be fully usable only after we migrate all
+tenants/timelines to storage_controller. It is discussible whether we want also
+to manage pageserver attachments for all of these, but likely we do.
+
+This requires us to define
+
+### storage_controller <-> control plane interface
+
+First of all, control plane should
+[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
+storing safekeepers per timeline instead of per tenant because we can't migrate
+tenants atomically. 
+
+The important question is how updated configuration is delivered from
+storage_controller to control plane to provide it to computes. As always, there
+are two options, pull and push. Let's do it the same push as with pageserver
+`/notify-attach` because 1) it keeps storage_controller out of critical compute
+start path 2) provides easier upgrade: there won't be such a thing as 'timeline
+managed by control plane / storcon', cplane just takes the value out of its db
+when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
+control plane until it succeeds.
+
+So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
+updates it in the db if the provided conf generation is higher (the cplane db
+should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
+should update db which makes the call successful, and then try to schedule
+`apply_config` if possible, it is ok if not. storage_controller 
+should rate limit calling the endpoint, but likely this won't be needed, as migration
+throughput is limited by `pull_timeline`.
+
+Timeline (branch) creation in cplane should call storage_controller POST
+`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
+Response should be augmented with `safekeeper_conf: Configuration`. The call
+should be retried until succeeds.
+
+Timeline deletion and tenant deletion in cplane should call appropriate
+storage_controller endpoints like it currently does for sharded tenants. The
+calls should be retried until they succeed.
+
+### storage_controller implementation
+
+Current 'load everything on startup and keep in memory' easy design is fine.
+Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
+byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
+10^6 of timelines shouldn't take more than 100MB.
+
+Similar to pageserver attachment Intents storage_controller would have in-memory
+`MigrationRequest` (or its absense) for each timeline and pool of tasks trying
+to make these request reality; this ensures one instance of storage_controller
+won't do several migrations on the same timeline concurrently. In the first
+version it is simpler to have more manual control and no retries, i.e. migration
+failure removes the request. Later we can build retries and automatic
+scheduling/migration.
+
+#### Schema
+
+`safekeepers` table mirroring current `nodes` should be added, except that for
+`scheduling_policy` field (maybe better name it `status`?) it is enough to have
+at least in the beginning only 3 fields: 1) `active` 2) `scheduling_disabled` 3)
+`decomissioned`.
+
+`timelines` table:
+```
+table! {
+    timelines {
+        timeline_id -> Varchar,
+        tenant_id -> Varchar,
+        generation -> Int4,
+        sk_set -> Jsonb, // list of safekeeper ids
+        new_sk_set -> Nullable<Jsonb>, // list of safekeeper ids, null if not join conf
+        delivered_to_majority -> Nullable<Bool>, // null if joint conf
+        cplane_notified_generation -> Int4,
+    }
+}
+```
+
+#### API
+
+
+
+#### Dealing with multiple instances of storage_controller
+
+neon_local, pytest
+
+## Testing
+
+## Integration with evicted timeline
+
+## Order of implementation and rollout
+
+note that 
+- core can be developed ignoring cplane integration (neon_local will use storcon, but prod not)
+- there is a lot of infra work and it woud be great to separate its rollout from the core
+- wp could ignore joint consensus for some time
+
+rough order:
+- add sk infra, but not enforce confs
+- change proto
+- add wp proto, but not enforce confs
+- implement storconn. It will be used and tested by neon_local.
+- implement cplane/storcon integration. Route branch creation/deletion 
+  through storcon. Then we can test migration of these branches, hm.
+  In principle sk choice from cplane can be removed at this point. 
+  However, that would be bad because before import 1) 
+  storconn doesn't know about existing project so can't colocate tenants 
+  2) neither it knows about capacity. So we could instead allow to set sk 
+  set in the branch creation request.
+  These cplane -> storconn calls should be under feature flag; 
+  rollback is safe.
+- finally import existing branches. Then we can drop cplane 
+  sk selection code.
+  also only at this point wp will always use generations and
+  so we can drop 'tli creation on connect'.
+
+
+## Possible optimizations
+
+`AcceptorRefusal` separate message
+
+Preserving connections (not neede)
+
+multiple joint consensus (not neede)
+
+## Misc
+
+We should use Compute <-> safekeeper protocol change to include other (long
+yearned) modifications:
+

From 1e789fb9631ecb394b18c9f051d3775f2234272f Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 19 Jul 2024 18:06:10 +0300
Subject: [PATCH 04/62] wipwip

---
 ...35-safekeeper-dynamic-membership-change.md | 171 +++++++++++++++---
 1 file changed, 144 insertions(+), 27 deletions(-)

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
index 4872fbaf89..2fc3f2066b 100644
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -30,21 +30,12 @@ applies to the single timeline.
 
 ### Definitions
 
-A SafekeeperId is
-```
-struct SafekeeperId {
-    node_id: NodeId,
-    // Not strictly required for this RFC but useful for asserts and potentially other purposes in the future
-    hostname: String,
-}
-```
-
 A configuration is
 
 ```
 struct Configuration {
     generation: Generation, // a number uniquely identifying configuration
-    sk_set: Vec<SafekeeperId>, // current safekeeper set
+    sk_set: Vec<NodeId>, // current safekeeper set
     new_sk_set: Optional<Vec<SafekeeperId>>,
 }
 ```
@@ -76,7 +67,13 @@ case of sk->wp message.
 ### Safekeeper changes
 
 Basic rule: once safekeeper observes configuration higher than his own it
-immediately switches to it.
+immediately switches to it. It must refuse all messages with lower generation
+that his. It also refuses messages if it is not member of the current
+generation, though it is likely not unsafe to process them (walproposer should
+ignore them anyway).
+
+If there is non null configuration in `ProposerGreeting` and it is higher than
+current safekeeper one, safekeeper switches to it.
 
 Safekeeper sends its current configuration in its first message to walproposer
 `AcceptorGreeting`. It refuses all other walproposer messages if the
@@ -122,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts.
 The following algorithm can be executed anywhere having access to configuration
 storage and safekeepers. It is safe to interrupt / restart it and run multiple
 instances of it concurrently, though likely one of them won't make
-progress then. It accepts `desired_set: Vec<SafekeeperId>` as input. 
+progress then. It accepts `desired_set: Vec<NodeId>` as input. 
 
 Algorithm will refuse to make the change if it encounters previous interrupted
 change attempt, but in this case it will try to finish it.
@@ -150,10 +147,10 @@ storage are reachable.
    of the new set catches up to this position because data before it 
    could be committed without ack from the new set.
 4) Initialize timeline on safekeeper(s) from `new_sk_set` where it 
-   doesn't exist yet by doing `pull_timeline` from current set. Doing 
-   that on majority of `new_sk_set` is enough to proceed, but it is 
-   reasonable to ensure that all `new_sk_set` members are initialized 
-   -- if some of them are down why are we migrating there?
+   doesn't exist yet by doing `pull_timeline` from the majority of the 
+   current set. Doing  that on majority of `new_sk_set` is enough to
+   proceed, but it is reasonable to ensure that all `new_sk_set` members
+   are initialized -- if some of them are down why are we migrating there?
 5) Call `PUT` `configuration` on safekeepers from the new set,
    delivering them `joint_conf` and collecting their positions. This will
    switch them to the `joint_conf` which generally won't be needed 
@@ -179,8 +176,8 @@ Description above focuses on safety. To make the flow practical and live, here a
 considerations.
 1) It makes sense to ping new set to ensure it we are migrating to live node(s) before 
   step 3.
-2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed we
-   can rollback to the old conf with one more CAS.
+2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed 
+   it is safe to rollback to the old conf with one more CAS.
 3) On step 4 timeline might be already created on members of the new set for various reasons; 
    the simplest is the procedure restart. There are more complicated scenarious like mentioned
    in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving 
@@ -188,8 +185,11 @@ considerations.
    has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
    the step 5 is never reached until compute is (re)awaken up to synchronize new member(s).
    I don't think we'll observe this in practice, but can add waking up compute if needed.
-4) To do step 7 in case of failure immediately after completion of CAS in step 6, 
-   configuration storage should also have `delivered_to_majority` flag for non join configurations.
+4) In the end timeline should be locally deleted on the safekeeper(s) which are
+   in the old set but not in the new one, unless they are unreachable. To be
+   safe this also should be done under generation number.
+5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
+   jump to step 7, using it as `new_conf`.
 
 ## Implementation
 
@@ -251,25 +251,37 @@ to make these request reality; this ensures one instance of storage_controller
 won't do several migrations on the same timeline concurrently. In the first
 version it is simpler to have more manual control and no retries, i.e. migration
 failure removes the request. Later we can build retries and automatic
-scheduling/migration.
+scheduling/migration. `MigrationRequest` is
+```
+enum MigrationRequest {
+    To(Vec<NodeId>),
+    FinishPending,
+}
+```
+
+`FinishPending` requests to run the procedure to ensure state is clean: current
+configuration is not joint and majority of safekeepers are aware of it, but do
+not attempt to migrate anywhere. If current configuration fetched on step 1 is
+not joint it jumps to step 7. It should be run at startup for all timelines (but
+similarly, in the first version it is ok to trigger it manually).
 
 #### Schema
 
 `safekeepers` table mirroring current `nodes` should be added, except that for
-`scheduling_policy` field (maybe better name it `status`?) it is enough to have
-at least in the beginning only 3 fields: 1) `active` 2) `scheduling_disabled` 3)
+`scheduling_policy` field (seems like `status` is a better name for it): it is enough
+to have at least in the beginning only 3 fields: 1) `active` 2) `unavailable` 3)
 `decomissioned`.
 
 `timelines` table:
 ```
 table! {
-    timelines {
+    // timeline_id is primary key
+    timelines (timeline_id) {
         timeline_id -> Varchar,
         tenant_id -> Varchar,
         generation -> Int4,
         sk_set -> Jsonb, // list of safekeeper ids
         new_sk_set -> Nullable<Jsonb>, // list of safekeeper ids, null if not join conf
-        delivered_to_majority -> Nullable<Bool>, // null if joint conf
         cplane_notified_generation -> Int4,
     }
 }
@@ -277,15 +289,117 @@ table! {
 
 #### API
 
+Node management is similar to pageserver:
+1) POST `/control/v1/safekeepers` upserts safekeeper.
+2) GET `/control/v1/safekeepers` lists safekeepers.
+3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
+4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
+   `unavailable` or `decomissioned`. Initially it is simpler not to schedule any
+    migrations here.
 
+Safekeeper deploy scripts should register safekeeper at storage_contorller as
+they currently do with cplane, under the same id.
+
+Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
+would 1) choose initial set of safekeepers; 2) write to the db initial
+`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
+case of conflict; 3) create timeline on the majority of safekeepers (already
+created is ok).
+
+We don't want to block timeline creation when one safekeeper is down. Currently
+this is solved by compute implicitly creating timeline on any safekeeper it is
+connected to. This creates ugly timeline state on safekeeper when timeline is
+created, but start LSN is not defined yet. It would be nice to remove this; to
+do that, controller can in the background retry to create timeline on
+safekeeper(s) which missed that during initial creation call. It can do that
+through `pull_timeline` from majority so it doesn't need to remember
+`parent_lsn` in its db.
+
+Timeline deletion removes the row from the db and forwards deletion to the
+current configuration members. Without additional actions deletions might leak,
+see below on this; initially let's ignore these, reporting to cplane success if
+at least one safekeeper deleted the timeline (this will remove s3 data).
+
+Tenant deletion repeats timeline deletion for all timelines.
+
+Migration API: the first version is the simplest and the most imperative:
+1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move
+all timelines from one safekeeper to another. It accepts json
+```
+{
+    "src_sk": u32,
+    "dst_sk": u32,
+    "limit": Optional<u32>,
+}
+```
+
+Returns list of scheduled requests.
+
+2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
+   to move single timeline to given set of safekeepers:
+{
+    "desired_set": Vec<u32>,
+}
+
+Returns scheduled request.
+
+Similar call should be added for the tenant.
+
+It would be great to have some way of subscribing to the results (appart from
+looking at logs/metrics).
+
+Migration is executed as described above. One subtlety is that (local) deletion on
+source safekeeper might fail, which is not a problem if we are going to
+decomission the node but leaves garbage otherwise. I'd propose in the first version
+1) Don't attempt deletion at all if node status is `unavailable`.
+2) If it failed, just issue warning.
+And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and 
+remove garbage timelines for manual use. It will 1) list all timelines on the 
+safekeeper 2) compare each one against configuration storage: if timeline 
+doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can 
+be deleted under generation number if node is not member of current generation.
+
+Automating this is untrivial; we'd need to register all potential missing
+deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
+which switches configurations. Similarly when timeline is fully deleted to
+prevent cplane operation from blocking when some safekeeper is not available
+deletion should be also registered.
+
+3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
+   current in memory state of the timeline and pending `MigrationRequest`,
+   if any.
+
+4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the
+   migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
+   (incrementing generation as always).
 
 #### Dealing with multiple instances of storage_controller
 
-neon_local, pytest
+Operations described above executed concurrently might create some errors but do
+not prevent progress, so while we normally don't want to run multiple instances
+of storage_controller it is fine to have it temporarily, e.g. during redeploy.
+
+Any interactions with db update in-memory controller state, e.g. if migration
+request failed because different one is in progress, controller remembers that
+and tries to finish it.
 
 ## Testing
 
-## Integration with evicted timeline
+`neon_local` should be switched to use storage_controller, playing role of
+control plane.
+
+There should be following layers of tests:
+1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety.
+
+2) To cover real code and at the same time test many schedules we should have
+   simulation tests. For that, configuration storage, storage_controller <->
+   safekeeper communication and pull_timeline need to be mocked and main switch
+   procedure wrapped to as a node (thread) in simulation tests, using these
+   mocks. Test would inject migrations like it currently injects
+   safekeeper/walproposer restars. Main assert is the same -- committed WAL must
+   not be lost.
+
+3) Additionally it would be good to have basic tests covering the whole system.
 
 ## Order of implementation and rollout
 
@@ -294,6 +408,8 @@ note that
 - there is a lot of infra work and it woud be great to separate its rollout from the core
 - wp could ignore joint consensus for some time
 
+TimelineCreateRequest should get optional safekeepers field with safekeepers chosen by cplane.
+
 rough order:
 - add sk infra, but not enforce confs
 - change proto
@@ -313,6 +429,7 @@ rough order:
   also only at this point wp will always use generations and
   so we can drop 'tli creation on connect'.
 
+## Integration with evicted timelines
 
 ## Possible optimizations
 

From 7b50c1a4576fbc3283e06403586915824e5c3ee6 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 22 Jul 2024 16:25:06 +0300
Subject: [PATCH 05/62] more wip

ref https://github.com/neondatabase/cloud/issues/14668
---
 ...35-safekeeper-dynamic-membership-change.md | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
index 2fc3f2066b..e9183c9829 100644
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -36,7 +36,7 @@ A configuration is
 struct Configuration {
     generation: Generation, // a number uniquely identifying configuration
     sk_set: Vec<NodeId>, // current safekeeper set
-    new_sk_set: Optional<Vec<SafekeeperId>>,
+    new_sk_set: Optional<Vec<NodeId>>,
 }
 ```
 
@@ -337,9 +337,11 @@ Returns list of scheduled requests.
 
 2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
    to move single timeline to given set of safekeepers:
+```
 {
     "desired_set": Vec<u32>,
 }
+```
 
 Returns scheduled request.
 
@@ -399,14 +401,22 @@ There should be following layers of tests:
    safekeeper/walproposer restars. Main assert is the same -- committed WAL must
    not be lost.
 
-3) Additionally it would be good to have basic tests covering the whole system.
+3) Since simulation testing injects at relatively high level points (not
+   syscalls), it omits some code, in particular `pull_timeline`. Thus it 
+   is better to have basic tests covering whole system. Extended 
+   version of `test_restarts_under_load` would do. TBD
+
+4) Basic e2e test should ensure that full flow including cplane notification works.
 
 ## Order of implementation and rollout
 
 note that 
-- core can be developed ignoring cplane integration (neon_local will use storcon, but prod not)
-- there is a lot of infra work and it woud be great to separate its rollout from the core
-- wp could ignore joint consensus for some time
+- Control plane parts and integration with it is fully independent from everything else
+  (tests would use simulation and neon_local).
+- There is a lot of infra work making storage_controller aware of timelines and safekeepers
+  and its impl/rollout should be separate from migration itself.
+- Initially walproposer can just stop working while it observers joint configuration.
+  Such window would be typically very short anyway.
 
 TimelineCreateRequest should get optional safekeepers field with safekeepers chosen by cplane.
 
@@ -435,12 +445,13 @@ rough order:
 
 `AcceptorRefusal` separate message
 
-Preserving connections (not neede)
+Preserving connections (not needed)
 
-multiple joint consensus (not neede)
+multiple joint consensus (not needed)
 
 ## Misc
 
 We should use Compute <-> safekeeper protocol change to include other (long
 yearned) modifications:
+- network order
 

From 4d1cf2dc6f6406f51333af0a495146fe3dbb9153 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 23 Jul 2024 17:58:32 +0300
Subject: [PATCH 06/62] tests, rollout

---
 ...35-safekeeper-dynamic-membership-change.md | 94 ++++++++++++-------
 1 file changed, 62 insertions(+), 32 deletions(-)

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
index e9183c9829..88087270d6 100644
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -347,7 +347,7 @@ Returns scheduled request.
 
 Similar call should be added for the tenant.
 
-It would be great to have some way of subscribing to the results (appart from
+It would be great to have some way of subscribing to the results (apart from
 looking at logs/metrics).
 
 Migration is executed as described above. One subtlety is that (local) deletion on
@@ -367,6 +367,9 @@ which switches configurations. Similarly when timeline is fully deleted to
 prevent cplane operation from blocking when some safekeeper is not available
 deletion should be also registered.
 
+One more task pool should infinitely retry notifying control plane about changed
+safekeeper sets.
+
 3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
    current in memory state of the timeline and pending `MigrationRequest`,
    if any.
@@ -402,15 +405,19 @@ There should be following layers of tests:
    not be lost.
 
 3) Since simulation testing injects at relatively high level points (not
-   syscalls), it omits some code, in particular `pull_timeline`. Thus it 
-   is better to have basic tests covering whole system. Extended 
-   version of `test_restarts_under_load` would do. TBD
+   syscalls), it omits some code, in particular `pull_timeline`. Thus it is
+   better to have basic tests covering whole system as well. Extended version of
+   `test_restarts_under_load` would do: start background load and do migration 
+   under it, then restart endpoint and check that no reported commits 
+   had been lost. I'd also add one more creating classic network split scenario, with
+   one compute talking to AC and another to BD while migration from nodes ABC to ABD
+   happens.
 
-4) Basic e2e test should ensure that full flow including cplane notification works.
+4) Simple e2e test should ensure that full flow including cplane notification works.
 
 ## Order of implementation and rollout
 
-note that 
+Note that 
 - Control plane parts and integration with it is fully independent from everything else
   (tests would use simulation and neon_local).
 - There is a lot of infra work making storage_controller aware of timelines and safekeepers
@@ -418,40 +425,63 @@ note that
 - Initially walproposer can just stop working while it observers joint configuration.
   Such window would be typically very short anyway.
 
-TimelineCreateRequest should get optional safekeepers field with safekeepers chosen by cplane.
+To rollout smoothly, both walproposer and safekeeper should have flag
+`configurations_enabled`; when set to false, they would work as currently, i.e.
+walproposer is able to commit on whatever safekeeper set it is provided. Until
+all timelines are managed by storcon we'd need to use current script to migrate
+and update/drop entries in the storage_controller database if it has any.
 
-rough order:
-- add sk infra, but not enforce confs
-- change proto
-- add wp proto, but not enforce confs
-- implement storconn. It will be used and tested by neon_local.
-- implement cplane/storcon integration. Route branch creation/deletion 
-  through storcon. Then we can test migration of these branches, hm.
-  In principle sk choice from cplane can be removed at this point. 
-  However, that would be bad because before import 1) 
-  storconn doesn't know about existing project so can't colocate tenants 
-  2) neither it knows about capacity. So we could instead allow to set sk 
-  set in the branch creation request.
-  These cplane -> storconn calls should be under feature flag; 
-  rollback is safe.
-- finally import existing branches. Then we can drop cplane 
-  sk selection code.
-  also only at this point wp will always use generations and
-  so we can drop 'tli creation on connect'.
+Safekeepers would need to be able to talk both current and new protocol version
+with compute to reduce number of computes restarted in prod once v2 protocol is
+deployed (though before completely switching we'd need to force this).
+
+Let's have the following rollout order:
+- storage_controller becomes aware of safekeepers;
+- storage_controller gets timeline creation for new timelines and deletion requests, but
+  doesn't manage all timelines yet. Migration can be tested on these new timelines.
+  To keep control plane and storage_controller databases in sync while control 
+  plane still chooses the safekeepers initially (until all timelines are imported
+  it can choose better), `TimelineCreateRequest` can get optional safekeepers
+  field with safekeepers chosen by cplane.
+- Then we can import all existing timelines from control plane to
+  storage_controller and gradually enable configurations region by region.
+
+
+Very rough implementation order:
+- Add concept of configurations to safekeepers (including control file),
+  implement v3 protocol.
+- Implement walproposer changes, including protocol.
+- Implement storconn part. Use it in neon_local (and pytest).
+- Make cplane store safekeepers per timeline instead of per tenant.
+- Implement cplane/storcon integration. Route branch creation/deletion 
+  through storcon. Then we can test migration of new branches.
+- Finally import existing branches. Then we can drop cplane 
+  safekeeper selection code. Gradually enable configurations at 
+  computes and safekeepers. Before that, all computes must talk only
+  v3 protocol version.
 
 ## Integration with evicted timelines
 
+Currently, `pull_timeline` doesn't work correctly with evicted timelines because
+copy would point to original partial file. To fix let's just do s3 copy of the
+file. It is a bit stupid as generally unnecessary work, but it makes sense to
+implement proper migration before doing smarter timeline archival.
+
 ## Possible optimizations
 
-`AcceptorRefusal` separate message
 
-Preserving connections (not needed)
+Algorithm suggested above forces walproposer re-election (technically restart)
+and thus reconnection to safekeepers; essentially we treat generation as part of
+term and don't allow leader to survive configuration change. It is possible to
+optimize this, but this is untrivial and I don't think needed. Reconnection is
+very fast and it is much more important to avoid compute restart than
+millisecond order of write stall.
 
-multiple joint consensus (not needed)
+Multiple joint consensus: algorithm above rejects attempt to change membership
+while another attempt is in progress. It is possible to overlay them and AFAIK
+Aurora does this but similarly I don't think this is needed.
 
 ## Misc
 
-We should use Compute <-> safekeeper protocol change to include other (long
-yearned) modifications:
-- network order
-
+We should use Compute <-> safekeeper protocol change to include another long
+yearned modifications: send data in network order to make arm work.

From c9d2b6119576d8bd5f98460ac249db468b51bc7a Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 2 Aug 2024 12:28:11 +0300
Subject: [PATCH 07/62] fix term uniqueness

---
 ...35-safekeeper-dynamic-membership-change.md | 51 ++++++++++---------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
index 88087270d6..ed831f1492 100644
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -68,9 +68,9 @@ case of sk->wp message.
 
 Basic rule: once safekeeper observes configuration higher than his own it
 immediately switches to it. It must refuse all messages with lower generation
-that his. It also refuses messages if it is not member of the current
-generation, though it is likely not unsafe to process them (walproposer should
-ignore them anyway).
+that his. It also refuses messages if it is not member of the current generation
+(that is, of either `sk_set` of `sk_new_set`), though it is likely not unsafe to
+process them (walproposer should ignore result anyway).
 
 If there is non null configuration in `ProposerGreeting` and it is higher than
 current safekeeper one, safekeeper switches to it.
@@ -87,9 +87,9 @@ current one and ignores it otherwise. In any case it replies with
 ```
 struct ConfigurationSwitchResponse {
     conf: Configuration,
+    term: Term,
     last_log_term: Term,
     flush_lsn: Lsn,
-    term: Term, // not used by this RFC, but might be useful for observability
 }
 ```
 
@@ -142,29 +142,33 @@ storage are reachable.
    delivering them `joint_conf`. Collecting responses from majority is required
    to proceed. If any response returned generation higher than 
    `joint_conf.generation`, abort (another switch raced us). Otherwise, choose
-   max `<last_log_term, flush_lsn>` among responses and establish it as 
-   (in memory) `sync_position`. We can't finish switch until majority 
-   of the new set catches up to this position because data before it 
-   could be committed without ack from the new set.
-4) Initialize timeline on safekeeper(s) from `new_sk_set` where it 
+   max `<last_log_term, flush_lsn>` among responses and establish it as
+   (in memory) `sync_position`. Also choose max `term` and establish it as (in
+   memory) `sync_term`. We can't finish the switch until majority of the new set
+   catches up to this `sync_position` because data before it could be committed
+   without ack from the new set. Similarly, we'll bump term on new majority
+   to `sync_term` so that two computes with the same term are never elected.
+4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
    doesn't exist yet by doing `pull_timeline` from the majority of the 
-   current set. Doing  that on majority of `new_sk_set` is enough to
+   current set. Doing that on majority of `new_sk_set` is enough to
    proceed, but it is reasonable to ensure that all `new_sk_set` members
    are initialized -- if some of them are down why are we migrating there?
-5) Call `PUT` `configuration` on safekeepers from the new set,
+5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. 
+   Success on majority is enough.
+6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
    delivering them `joint_conf` and collecting their positions. This will
    switch them to the `joint_conf` which generally won't be needed 
    because `pull_timeline` already includes it and plus additionally would be
-   broadcast by compute. More importantly, we may proceed to the next step 
+   broadcast by compute. More importantly, we may proceed to the next step
    only when `<last_log_term, flush_lsn>` on the majority of the new set reached 
-   `sync_position`. Similarly, on the happy path this is not needed because 
-   `pull_timeline` already includes it. However, it is better to double
+   `sync_position`. Similarly, on the happy path no waiting is not needed because 
+   `pull_timeline` already includes it. However, we should double
     check to be safe. For example, timeline could have been created earlier e.g.
-    manually or after try-to-migrate, abort, try-to-migrate-again sequence.   
-6) Create `new_conf: Configuration` incrementing `join_conf` generation and having new 
+    manually or after try-to-migrate, abort, try-to-migrate-again sequence. 
+7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new 
    safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration 
    storage under one more CAS.
-7) Call `PUT` `configuration` on safekeepers from the new set,
+8) Call `PUT` `configuration` on safekeepers from the new set,
    delivering them `new_conf`. It is enough to deliver it to the majority 
    of the new set; the rest can be updated by compute.
 
@@ -469,13 +473,12 @@ implement proper migration before doing smarter timeline archival.
 
 ## Possible optimizations
 
-
-Algorithm suggested above forces walproposer re-election (technically restart)
-and thus reconnection to safekeepers; essentially we treat generation as part of
-term and don't allow leader to survive configuration change. It is possible to
-optimize this, but this is untrivial and I don't think needed. Reconnection is
-very fast and it is much more important to avoid compute restart than
-millisecond order of write stall.
+Steps above suggest walproposer restart (with re-election) and thus reconnection
+to safekeepers. Since by bumping term on new majority we ensure that leader
+terms are unique even across generation switches it is possible to preserve
+connections. However, it is more complicated, reconnection is very fast and it
+is much more important to avoid compute restart than millisecond order of write
+stall.
 
 Multiple joint consensus: algorithm above rejects attempt to change membership
 while another attempt is in progress. It is possible to overlay them and AFAIK

From 28ef1522d63b31e8735fa84e45cfd6d336972dfc Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 2 Aug 2024 13:46:32 +0300
Subject: [PATCH 08/62] cosmetic fixes

---
 docs/rfcs/035-safekeeper-dynamic-membership-change.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
index ed831f1492..0d7396cf93 100644
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -191,7 +191,8 @@ considerations.
    I don't think we'll observe this in practice, but can add waking up compute if needed.
 4) In the end timeline should be locally deleted on the safekeeper(s) which are
    in the old set but not in the new one, unless they are unreachable. To be
-   safe this also should be done under generation number.
+   safe this also should be done under generation number (deletion proceeds only if 
+   current configuration is <= than one in request and safekeeper is not memeber of it).
 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
    jump to step 7, using it as `new_conf`.
 
@@ -207,7 +208,7 @@ This assumes that migration will be fully usable only after we migrate all
 tenants/timelines to storage_controller. It is discussible whether we want also
 to manage pageserver attachments for all of these, but likely we do.
 
-This requires us to define
+This requires us to define storcon <-> cplane interface.
 
 ### storage_controller <-> control plane interface
 
@@ -273,14 +274,14 @@ similarly, in the first version it is ok to trigger it manually).
 
 `safekeepers` table mirroring current `nodes` should be added, except that for
 `scheduling_policy` field (seems like `status` is a better name for it): it is enough
-to have at least in the beginning only 3 fields: 1) `active` 2) `unavailable` 3)
+to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
 `decomissioned`.
 
 `timelines` table:
 ```
 table! {
     // timeline_id is primary key
-    timelines (timeline_id) {
+    timelines (tenant_id, timeline_id) {
         timeline_id -> Varchar,
         tenant_id -> Varchar,
         generation -> Int4,

From 930763cad2278a65b64cabaa231ea9b356f479ca Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 6 Aug 2024 17:25:49 +0300
Subject: [PATCH 09/62] s/jsonb/array

---
 docs/rfcs/035-safekeeper-dynamic-membership-change.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
index 0d7396cf93..307606da38 100644
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -285,8 +285,8 @@ table! {
         timeline_id -> Varchar,
         tenant_id -> Varchar,
         generation -> Int4,
-        sk_set -> Jsonb, // list of safekeeper ids
-        new_sk_set -> Nullable<Jsonb>, // list of safekeeper ids, null if not join conf
+        sk_set -> Array<Int4>, // list of safekeeper ids
+        new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
         cplane_notified_generation -> Int4,
     }
 }
@@ -299,7 +299,7 @@ Node management is similar to pageserver:
 2) GET `/control/v1/safekeepers` lists safekeepers.
 3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
 4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
-   `unavailable` or `decomissioned`. Initially it is simpler not to schedule any
+   `offline` or `decomissioned`. Initially it is simpler not to schedule any
     migrations here.
 
 Safekeeper deploy scripts should register safekeeper at storage_contorller as
@@ -358,7 +358,7 @@ looking at logs/metrics).
 Migration is executed as described above. One subtlety is that (local) deletion on
 source safekeeper might fail, which is not a problem if we are going to
 decomission the node but leaves garbage otherwise. I'd propose in the first version
-1) Don't attempt deletion at all if node status is `unavailable`.
+1) Don't attempt deletion at all if node status is `offline`.
 2) If it failed, just issue warning.
 And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and 
 remove garbage timelines for manual use. It will 1) list all timelines on the 
@@ -470,7 +470,7 @@ Very rough implementation order:
 Currently, `pull_timeline` doesn't work correctly with evicted timelines because
 copy would point to original partial file. To fix let's just do s3 copy of the
 file. It is a bit stupid as generally unnecessary work, but it makes sense to
-implement proper migration before doing smarter timeline archival.
+implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542)
 
 ## Possible optimizations
 

From 06df6ca52e3b245727fbd76a21050ec98c8e83e1 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 6 Aug 2024 17:37:59 +0300
Subject: [PATCH 10/62] proto changes

---
 docs/rfcs/035-safekeeper-dynamic-membership-change.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
index 307606da38..239ec58186 100644
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -487,5 +487,9 @@ Aurora does this but similarly I don't think this is needed.
 
 ## Misc
 
-We should use Compute <-> safekeeper protocol change to include another long
-yearned modifications: send data in network order to make arm work.
+We should use Compute <-> safekeeper protocol change to include other (long
+yearned) modifications:
+- send data in network order to make arm work.
+- remove term_start_lsn from AppendRequest
+- add horizon to TermHistory
+- add to ProposerGreeting number of connection from this wp to sk

From 41b5ee491edc75d3135a4e2b6b8a045244c3d6f7 Mon Sep 17 00:00:00 2001
From: Shinya Kato <37682778+shinyaaa@users.noreply.github.com>
Date: Mon, 12 Aug 2024 21:24:25 +0900
Subject: [PATCH 11/62] Fix a comment in walproposer_pg.c (#8583)

## Problem
Perhaps there is an error in the source code comment.

## Summary of changes
Fix "walsender" to "walproposer"
---
 pgxn/neon/walproposer_pg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 944b316344..f3ddc64061 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -512,7 +512,7 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe
 }
 
 /*
- * Start walsender streaming replication
+ * Start walproposer streaming replication
  */
 static void
 walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)

From 1b9a27d6e30f086f3ce8a41a617ae551ea0a4b0a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 12 Aug 2024 13:33:09 +0100
Subject: [PATCH 12/62] tests: reinstate test_bulk_insert (#8683)

## Problem

This test was disabled.

## Summary of changes

- Remove the skip marker.
- Explicitly avoid doing compaction & gc during checkpoints (the default
scale doesn't do anything here, but when experimeting with larger scales
it messes things up)
- Set a data size that gives a ~20s runtime on a Hetzner dev machine,
previous one gave very noisy results because it was so small


For reference on a Hetzner AX102:
```
------------------------------ Benchmark results -------------------------------
test_bulk_insert[neon-release-pg16].insert: 25.664 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 577 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 25.373 s
test_bulk_insert[neon-release-pg16].compaction: 0.035 s
```
---
 test_runner/fixtures/compare_fixtures.py    | 25 ++++++++++++++++-----
 test_runner/performance/test_bulk_insert.py | 14 +++++++-----
 2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 08215438e1..5fe544b3bd 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -42,7 +42,11 @@ class PgCompare(ABC):
         pass
 
     @abstractmethod
-    def flush(self):
+    def flush(self, compact: bool = False, gc: bool = False):
+        pass
+
+    @abstractmethod
+    def compact(self):
         pass
 
     @abstractmethod
@@ -129,13 +133,16 @@ class NeonCompare(PgCompare):
     def pg_bin(self) -> PgBin:
         return self._pg_bin
 
-    def flush(self):
+    def flush(self, compact: bool = True, gc: bool = True):
         wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline)
-        self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline)
-        self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0)
+        self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline, compact=compact)
+        if gc:
+            self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0)
 
     def compact(self):
-        self.pageserver_http_client.timeline_compact(self.tenant, self.timeline)
+        self.pageserver_http_client.timeline_compact(
+            self.tenant, self.timeline, wait_until_uploaded=True
+        )
 
     def report_peak_memory_use(self):
         self.zenbenchmark.record(
@@ -215,9 +222,12 @@ class VanillaCompare(PgCompare):
     def pg_bin(self) -> PgBin:
         return self._pg.pg_bin
 
-    def flush(self):
+    def flush(self, compact: bool = False, gc: bool = False):
         self.cur.execute("checkpoint")
 
+    def compact(self):
+        pass
+
     def report_peak_memory_use(self):
         pass  # TODO find something
 
@@ -266,6 +276,9 @@ class RemoteCompare(PgCompare):
         # TODO: flush the remote pageserver
         pass
 
+    def compact(self):
+        pass
+
     def report_peak_memory_use(self):
         # TODO: get memory usage from remote pageserver
         pass
diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index 3dad348976..69df7974b9 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -1,9 +1,9 @@
 from contextlib import closing
 
-import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare
+from fixtures.log_helper import log
 from fixtures.pg_version import PgVersion
 
 
@@ -17,7 +17,6 @@ from fixtures.pg_version import PgVersion
 # 3. Disk space used
 # 4. Peak memory usage
 #
-@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124")
 def test_bulk_insert(neon_with_baseline: PgCompare):
     env = neon_with_baseline
 
@@ -30,8 +29,8 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
             # Run INSERT, recording the time and I/O it takes
             with env.record_pageserver_writes("pageserver_writes"):
                 with env.record_duration("insert"):
-                    cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
-                    env.flush()
+                    cur.execute("insert into huge values (generate_series(1, 20000000), 0);")
+                    env.flush(compact=False, gc=False)
 
             env.report_peak_memory_use()
             env.report_size()
@@ -49,6 +48,9 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
     if isinstance(env, NeonCompare):
         measure_recovery_time(env)
 
+    with env.record_duration("compaction"):
+        env.compact()
+
 
 def measure_recovery_time(env: NeonCompare):
     client = env.env.pageserver.http_client()
@@ -71,7 +73,9 @@ def measure_recovery_time(env: NeonCompare):
 
     # Measure recovery time
     with env.record_duration("wal_recovery"):
+        log.info("Entering recovery...")
         client.timeline_create(pg_version, env.tenant, env.timeline)
 
         # Flush, which will also wait for lsn to catch up
-        env.flush()
+        env.flush(compact=False, gc=False)
+        log.info("Finished recovery.")

From 9dc9a9b2e950638b4fc018e1254879cbb430ba6a Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 12 Aug 2024 15:37:15 +0300
Subject: [PATCH 13/62] test: do graceful shutdown by default (#8655)

It should give us all possible allowed_errors more consistently.

While getting the workflows to pass on
https://github.com/neondatabase/neon/pull/8632 it was noticed that
allowed_errors are rarely hit (1/4). This made me realize that we always
do an immediate stop by default. Doing a graceful shutdown would had
made the draining more apparent and likely we would not have needed the
#8632 hotfix.

Downside of doing this is that we will see more timeouts if tests are
randomly leaving pause failpoints which fail the shutdown.

The net outcome should however be positive, we could even detect too
slow shutdowns caused by a bug or deadlock.
---
 pageserver/src/tenant.rs                      | 10 +++++
 .../src/tenant/remote_timeline_client.rs      |  5 ++-
 .../src/tenant/storage_layer/image_layer.rs   |  3 --
 pageserver/src/tenant/storage_layer/layer.rs  |  4 +-
 pageserver/src/tenant/tasks.rs                | 13 +++---
 pageserver/src/tenant/timeline.rs             | 40 ++++++++-----------
 pageserver/src/tenant/timeline/compaction.rs  | 22 ++++------
 .../walreceiver/walreceiver_connection.rs     |  3 ++
 test_runner/fixtures/neon_fixtures.py         |  2 +-
 test_runner/regress/test_ancestor_branch.py   |  6 ++-
 test_runner/regress/test_timeline_size.py     |  7 ++++
 11 files changed, 63 insertions(+), 52 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 90c0e28bc4..cfdb32f755 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -41,6 +41,7 @@ use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use upload_queue::NotInitialized;
 use utils::backoff;
 use utils::circuit_breaker::CircuitBreaker;
 use utils::completion;
@@ -601,6 +602,15 @@ impl From<PageReconstructError> for GcError {
     }
 }
 
+impl From<NotInitialized> for GcError {
+    fn from(value: NotInitialized) -> Self {
+        match value {
+            NotInitialized::Uninitialized => GcError::Remote(value.into()),
+            NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled,
+        }
+    }
+}
+
 impl From<timeline::layer_manager::Shutdown> for GcError {
     fn from(_: timeline::layer_manager::Shutdown) -> Self {
         GcError::TimelineCancelled
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 1344fe4192..8a76d7532f 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -985,7 +985,10 @@ impl RemoteTimelineClient {
     ///
     /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
     /// is invoked on them.
-    pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
+    pub(crate) fn schedule_gc_update(
+        self: &Arc<Self>,
+        gc_layers: &[Layer],
+    ) -> Result<(), NotInitialized> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 16ba0fda94..f9d3fdf186 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -369,9 +369,6 @@ impl ImageLayerInner {
         self.lsn
     }
 
-    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
-    /// - inner has the success or transient failure
-    /// - outer has the permanent failure
     pub(super) async fn load(
         path: &Utf8Path,
         lsn: Lsn,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 83450d24bb..0175f32268 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1848,8 +1848,8 @@ impl ResidentLayer {
     /// Read all they keys in this layer which match the ShardIdentity, and write them all to
     /// the provided writer.  Return the number of keys written.
     #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
-    pub(crate) async fn filter<'a>(
-        &'a self,
+    pub(crate) async fn filter(
+        &self,
         shard_identity: &ShardIdentity,
         writer: &mut ImageLayerWriter,
         ctx: &RequestContext,
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index b4706ea59d..713845e9ac 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -211,6 +211,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             } else {
                 // Run compaction
                 match tenant.compaction_iteration(&cancel, &ctx).await {
+                    Ok(has_pending_task) => {
+                        error_run_count = 0;
+                        // schedule the next compaction immediately in case there is a pending compaction task
+                        if has_pending_task { Duration::ZERO } else { period }
+                    }
                     Err(e) => {
                         let wait_duration = backoff::exponential_backoff_duration_seconds(
                             error_run_count + 1,
@@ -227,11 +232,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                         );
                         wait_duration
                     }
-                    Ok(has_pending_task) => {
-                        error_run_count = 0;
-                        // schedule the next compaction immediately in case there is a pending compaction task
-                        if has_pending_task { Duration::from_secs(0) } else { period }
-                    }
                 }
             };
 
@@ -265,7 +265,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                     count_throttled,
                     sum_throttled_usecs,
                     allowed_rps=%format_args!("{allowed_rps:.0}"),
-                    "shard was throttled in the last n_seconds")
+                    "shard was throttled in the last n_seconds"
+                );
             });
 
             // Sleep
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f810df5a56..b003834adf 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4421,22 +4421,24 @@ impl From<super::upload_queue::NotInitialized> for CompactionError {
     }
 }
 
-impl CompactionError {
-    /// We cannot do compaction because we could not download a layer that is input to the compaction.
-    pub(crate) fn input_layer_download_failed(
-        e: super::storage_layer::layer::DownloadError,
-    ) -> Self {
+impl From<super::storage_layer::layer::DownloadError> for CompactionError {
+    fn from(e: super::storage_layer::layer::DownloadError) -> Self {
         match e {
-            super::storage_layer::layer::DownloadError::TimelineShutdown |
-            /* TODO DownloadCancelled correct here? */
-            super::storage_layer::layer::DownloadError::DownloadCancelled  => CompactionError::ShuttingDown,
-            super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads |
-            super::storage_layer::layer::DownloadError::DownloadRequired |
-            super::storage_layer::layer::DownloadError::NotFile(_) |
-            super::storage_layer::layer::DownloadError::DownloadFailed |
-            super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)),
+            super::storage_layer::layer::DownloadError::TimelineShutdown
+            | super::storage_layer::layer::DownloadError::DownloadCancelled => {
+                CompactionError::ShuttingDown
+            }
+            super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
+            | super::storage_layer::layer::DownloadError::DownloadRequired
+            | super::storage_layer::layer::DownloadError::NotFile(_)
+            | super::storage_layer::layer::DownloadError::DownloadFailed
+            | super::storage_layer::layer::DownloadError::PreStatFailed(_) => {
+                CompactionError::Other(anyhow::anyhow!(e))
+            }
             #[cfg(test)]
-            super::storage_layer::layer::DownloadError::Failpoint(_) =>  CompactionError::Other(anyhow::anyhow!(e)),
+            super::storage_layer::layer::DownloadError::Failpoint(_) => {
+                CompactionError::Other(anyhow::anyhow!(e))
+            }
         }
     }
 }
@@ -4990,15 +4992,7 @@ impl Timeline {
 
             result.layers_removed = gc_layers.len() as u64;
 
-            self.remote_client
-                .schedule_gc_update(&gc_layers)
-                .map_err(|e| {
-                    if self.cancel.is_cancelled() {
-                        GcError::TimelineCancelled
-                    } else {
-                        GcError::Remote(e)
-                    }
-                })?;
+            self.remote_client.schedule_gc_update(&gc_layers)?;
 
             guard.open_mut()?.finish_gc_timeline(&gc_layers);
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 87ec46c0b5..8390cb839c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -489,10 +489,7 @@ impl Timeline {
             // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
             //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
             //    - ingestion, which only inserts layers, therefore cannot collide with us.
-            let resident = layer
-                .download_and_keep_resident()
-                .await
-                .map_err(CompactionError::input_layer_download_failed)?;
+            let resident = layer.download_and_keep_resident().await?;
 
             let keys_written = resident
                 .filter(&self.shard_identity, &mut image_layer_writer, ctx)
@@ -693,23 +690,14 @@ impl Timeline {
 
         let mut fully_compacted = true;
 
-        deltas_to_compact.push(
-            first_level0_delta
-                .download_and_keep_resident()
-                .await
-                .map_err(CompactionError::input_layer_download_failed)?,
-        );
+        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
         for l in level0_deltas_iter {
             let lsn_range = &l.layer_desc().lsn_range;
 
             if lsn_range.start != prev_lsn_end {
                 break;
             }
-            deltas_to_compact.push(
-                l.download_and_keep_resident()
-                    .await
-                    .map_err(CompactionError::input_layer_download_failed)?,
-            );
+            deltas_to_compact.push(l.download_and_keep_resident().await?);
             deltas_to_compact_bytes += l.metadata().file_size;
             prev_lsn_end = lsn_range.end;
 
@@ -1137,6 +1125,10 @@ impl Timeline {
 
             if !self.shard_identity.is_key_disposable(&key) {
                 if writer.is_none() {
+                    if self.cancel.is_cancelled() {
+                        // to be somewhat responsive to cancellation, check for each new layer
+                        return Err(CompactionError::ShuttingDown);
+                    }
                     // Create writer if not initiaized yet
                     writer = Some(
                         DeltaLayerWriter::new(
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index a66900522a..b5c577af72 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -335,6 +335,9 @@ pub(super) async fn handle_walreceiver_connection(
                             filtered_records += 1;
                         }
 
+                        // FIXME: this cannot be made pausable_failpoint without fixing the
+                        // failpoint library; in tests, the added amount of debugging will cause us
+                        // to timeout the tests.
                         fail_point!("walreceiver-after-ingest");
 
                         last_rec_lsn = lsn;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 4374e74a41..561e8bce04 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -963,7 +963,7 @@ class NeonEnvBuilder:
         if self.env:
             log.info("Cleaning up all storage and compute nodes")
             self.env.stop(
-                immediate=True,
+                immediate=False,
                 # if the test threw an exception, don't check for errors
                 # as a failing assertion would cause the cleanup below to fail
                 ps_assert_metric_no_errors=(exc_type is None),
diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py
index 7e40081aa2..f83b44a7ad 100644
--- a/test_runner/regress/test_ancestor_branch.py
+++ b/test_runner/regress/test_ancestor_branch.py
@@ -20,7 +20,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
         }
     )
 
-    pageserver_http.configure_failpoints(("flush-frozen-pausable", "sleep(10000)"))
+    failpoint = "flush-frozen-pausable"
+
+    pageserver_http.configure_failpoints((failpoint, "sleep(10000)"))
 
     endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant)
     branch0_cur = endpoint_branch0.connect().cursor()
@@ -96,3 +98,5 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
     assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000
 
     assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000
+
+    pageserver_http.configure_failpoints((failpoint, "off"))
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 1f220eec9e..642b9e449b 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -1137,3 +1137,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met
         delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True)
     else:
         raise RuntimeError(activation_method)
+
+    client.configure_failpoints(
+        [
+            ("timeline-calculate-logical-size-pause", "off"),
+            ("walreceiver-after-ingest", "off"),
+        ]
+    )

From ae527ef088ef1654854c0cbd9b4cc9ab3878619e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 12 Aug 2024 13:58:46 +0100
Subject: [PATCH 14/62] storcon: implement graceful leadership transfer (#8588)

## Problem
Storage controller restarts cause temporary unavailability from the
control plane POV. See RFC for more details.

## Summary of changes
* A couple of small refactors of the storage controller start-up
sequence to make extending it easier.
* A leader table is added to track the storage controller instance
that's currently the leader (if any)
* A peer client is added such that storage controllers can send
`step_down` requests to each other (implemented in
https://github.com/neondatabase/neon/pull/8512).
* Implement the leader cut-over as described in the RFC
* Add `start-as-candidate` flag to the storage controller to gate the
rolling restart behaviour. When the flag is `false` (the default), the
only change from the current start-up sequence is persisting the leader
entry to the database.
---
 .../2024-07-26-140924_create_leader/down.sql  |   1 +
 .../2024-07-26-140924_create_leader/up.sql    |   5 +
 storage_controller/src/lib.rs                 |   1 +
 storage_controller/src/main.rs                |  11 +
 storage_controller/src/metrics.rs             |  16 ++
 storage_controller/src/peer_client.rs         | 106 +++++++
 storage_controller/src/persistence.rs         |  74 +++++
 storage_controller/src/schema.rs              |   9 +-
 storage_controller/src/service.rs             | 261 +++++++++++++-----
 9 files changed, 407 insertions(+), 77 deletions(-)
 create mode 100644 storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
 create mode 100644 storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
 create mode 100644 storage_controller/src/peer_client.rs

diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
new file mode 100644
index 0000000000..53222c614e
--- /dev/null
+++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
@@ -0,0 +1 @@
+DROP TABLE controllers;
diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
new file mode 100644
index 0000000000..90546948cb
--- /dev/null
+++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
@@ -0,0 +1,5 @@
+CREATE TABLE controllers (
+  address VARCHAR NOT NULL,
+  started_at TIMESTAMPTZ NOT NULL,
+  PRIMARY KEY(address, started_at)
+);
diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs
index 26c258c466..2034addbe1 100644
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -11,6 +11,7 @@ mod id_lock_map;
 pub mod metrics;
 mod node;
 mod pageserver_client;
+mod peer_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index a66e9128bc..5a68799141 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -1,6 +1,7 @@
 use anyhow::{anyhow, Context};
 use clap::Parser;
 use diesel::Connection;
+use hyper::Uri;
 use metrics::launch_timestamp::LaunchTimestamp;
 use metrics::BuildInfo;
 use std::path::PathBuf;
@@ -83,6 +84,13 @@ struct Cli {
     #[arg(long, default_value = "5s")]
     db_connect_timeout: humantime::Duration,
 
+    #[arg(long, default_value = "false")]
+    start_as_candidate: bool,
+
+    // TODO: make this mandatory once the helm chart gets updated
+    #[arg(long)]
+    address_for_peers: Option<Uri>,
+
     /// `neon_local` sets this to the path of the neon_local repo dir.
     /// Only relevant for testing.
     // TODO: make `cfg(feature = "testing")`
@@ -285,6 +293,9 @@ async fn async_main() -> anyhow::Result<()> {
         split_threshold: args.split_threshold,
         neon_local_repo_dir: args.neon_local_repo_dir,
         max_secondary_lag_bytes: args.max_secondary_lag_bytes,
+        address_for_peers: args.address_for_peers,
+        start_as_candidate: args.start_as_candidate,
+        http_service_port: args.listen.port() as i32,
     };
 
     // After loading secrets & config, but before starting anything else, apply database migrations
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index a1a4b8543d..c2303e7a7f 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -12,6 +12,7 @@ use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, Metr
 use metrics::NeonMetrics;
 use once_cell::sync::Lazy;
 use std::sync::Mutex;
+use strum::IntoEnumIterator;
 
 use crate::{
     persistence::{DatabaseError, DatabaseOperation},
@@ -241,3 +242,18 @@ impl DatabaseError {
         }
     }
 }
+
+/// Update the leadership status metric gauges to reflect the requested status
+pub(crate) fn update_leadership_status(status: LeadershipStatus) {
+    let status_metric = &METRICS_REGISTRY
+        .metrics_group
+        .storage_controller_leadership_status;
+
+    for s in LeadershipStatus::iter() {
+        if s == status {
+            status_metric.set(LeadershipStatusGroup { status: s }, 1);
+        } else {
+            status_metric.set(LeadershipStatusGroup { status: s }, 0);
+        }
+    }
+}
diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs
new file mode 100644
index 0000000000..ebb59a1720
--- /dev/null
+++ b/storage_controller/src/peer_client.rs
@@ -0,0 +1,106 @@
+use crate::tenant_shard::ObservedState;
+use pageserver_api::shard::TenantShardId;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use tokio_util::sync::CancellationToken;
+
+use hyper::Uri;
+use reqwest::{StatusCode, Url};
+use utils::{backoff, http::error::HttpErrorBody};
+
+#[derive(Debug, Clone)]
+pub(crate) struct PeerClient {
+    uri: Uri,
+    jwt: Option<String>,
+    client: reqwest::Client,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum StorageControllerPeerError {
+    #[error("failed to deserialize error response with status code {0} at {1}: {2}")]
+    DeserializationError(StatusCode, Url, reqwest::Error),
+    #[error("storage controller peer API error ({0}): {1}")]
+    ApiError(StatusCode, String),
+    #[error("failed to send HTTP request: {0}")]
+    SendError(reqwest::Error),
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+pub(crate) type Result<T> = std::result::Result<T, StorageControllerPeerError>;
+
+pub(crate) trait ResponseErrorMessageExt: Sized {
+    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
+}
+
+impl ResponseErrorMessageExt for reqwest::Response {
+    async fn error_from_body(self) -> Result<Self> {
+        let status = self.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            return Ok(self);
+        }
+
+        let url = self.url().to_owned();
+        Err(match self.json::<HttpErrorBody>().await {
+            Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg),
+            Err(err) => StorageControllerPeerError::DeserializationError(status, url, err),
+        })
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default)]
+pub(crate) struct GlobalObservedState(pub(crate) HashMap<TenantShardId, ObservedState>);
+
+impl PeerClient {
+    pub(crate) fn new(uri: Uri, jwt: Option<String>) -> Self {
+        Self {
+            uri,
+            jwt,
+            client: reqwest::Client::new(),
+        }
+    }
+
+    async fn request_step_down(&self) -> Result<GlobalObservedState> {
+        let step_down_path = format!("{}control/v1/step_down", self.uri);
+        let req = self.client.put(step_down_path);
+        let req = if let Some(jwt) = &self.jwt {
+            req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}"))
+        } else {
+            req
+        };
+
+        let res = req
+            .send()
+            .await
+            .map_err(StorageControllerPeerError::SendError)?;
+        let response = res.error_from_body().await?;
+
+        let status = response.status();
+        let url = response.url().to_owned();
+
+        response
+            .json()
+            .await
+            .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err))
+    }
+
+    /// Request the peer to step down and return its current observed state
+    /// All errors are retried with exponential backoff for a maximum of 4 attempts.
+    /// Assuming all retries are performed, the function times out after roughly 4 seconds.
+    pub(crate) async fn step_down(
+        &self,
+        cancel: &CancellationToken,
+    ) -> Result<GlobalObservedState> {
+        backoff::retry(
+            || self.request_step_down(),
+            |_e| false,
+            2,
+            4,
+            "Send step down request",
+            cancel,
+        )
+        .await
+        .ok_or_else(|| StorageControllerPeerError::Cancelled)
+        .and_then(|x| x)
+    }
+}
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 64a3e597ce..aebbdec0d1 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -95,6 +95,8 @@ pub(crate) enum DatabaseOperation {
     ListMetadataHealth,
     ListMetadataHealthUnhealthy,
     ListMetadataHealthOutdated,
+    GetLeader,
+    UpdateLeader,
 }
 
 #[must_use]
@@ -785,6 +787,69 @@ impl Persistence {
         )
         .await
     }
+
+    /// Get the current entry from the `leader` table if one exists.
+    /// It is an error for the table to contain more than one entry.
+    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
+        let mut leader: Vec<ControllerPersistence> = self
+            .with_measured_conn(
+                DatabaseOperation::GetLeader,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::controllers::table.load::<ControllerPersistence>(conn)?)
+                },
+            )
+            .await?;
+
+        if leader.len() > 1 {
+            return Err(DatabaseError::Logical(format!(
+                "More than one entry present in the leader table: {leader:?}"
+            )));
+        }
+
+        Ok(leader.pop())
+    }
+
+    /// Update the new leader with compare-exchange semantics. If `prev` does not
+    /// match the current leader entry, then the update is treated as a failure.
+    /// When `prev` is not specified, the update is forced.
+    pub(crate) async fn update_leader(
+        &self,
+        prev: Option<ControllerPersistence>,
+        new: ControllerPersistence,
+    ) -> DatabaseResult<()> {
+        use crate::schema::controllers::dsl::*;
+
+        let updated = self
+            .with_measured_conn(
+                DatabaseOperation::UpdateLeader,
+                move |conn| -> DatabaseResult<usize> {
+                    let updated = match &prev {
+                        Some(prev) => diesel::update(controllers)
+                            .filter(address.eq(prev.address.clone()))
+                            .filter(started_at.eq(prev.started_at))
+                            .set((
+                                address.eq(new.address.clone()),
+                                started_at.eq(new.started_at),
+                            ))
+                            .execute(conn)?,
+                        None => diesel::insert_into(controllers)
+                            .values(new.clone())
+                            .execute(conn)?,
+                    };
+
+                    Ok(updated)
+                },
+            )
+            .await?;
+
+        if updated == 0 {
+            return Err(DatabaseError::Logical(
+                "Leader table update failed".to_string(),
+            ));
+        }
+
+        Ok(())
+    }
 }
 
 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -910,3 +975,12 @@ impl From<MetadataHealthPersistence> for MetadataHealthRecord {
         }
     }
 }
+
+#[derive(
+    Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone,
+)]
+#[diesel(table_name = crate::schema::controllers)]
+pub(crate) struct ControllerPersistence {
+    pub(crate) address: String,
+    pub(crate) started_at: chrono::DateTime<chrono::Utc>,
+}
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index cb5ba3f38b..77ba47e114 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -1,5 +1,12 @@
 // @generated automatically by Diesel CLI.
 
+diesel::table! {
+    controllers (address, started_at) {
+        address -> Varchar,
+        started_at -> Timestamptz,
+    }
+}
+
 diesel::table! {
     metadata_health (tenant_id, shard_number, shard_count) {
         tenant_id -> Varchar,
@@ -36,4 +43,4 @@ diesel::table! {
     }
 }
 
-diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
+diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 31b2d0c3f5..fe582cf0e2 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1,3 +1,4 @@
+use hyper::Uri;
 use std::{
     borrow::Cow,
     cmp::Ordering,
@@ -16,8 +17,11 @@ use crate::{
     compute_hook::NotifyError,
     drain_utils::{self, TenantShardDrain, TenantShardIterator},
     id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
-    metrics::LeadershipStatusGroup,
-    persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
+    metrics,
+    peer_client::{GlobalObservedState, PeerClient},
+    persistence::{
+        AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter,
+    },
     reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
     scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
     tenant_shard::{
@@ -83,7 +87,6 @@ use crate::{
         ReconcilerWaiter, TenantShard,
     },
 };
-use serde::{Deserialize, Serialize};
 
 pub mod chaos_injector;
 
@@ -140,7 +143,15 @@ enum NodeOperations {
 /// Allowed transitions are:
 /// 1. Leader -> SteppedDown
 /// 2. Candidate -> Leader
-#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)]
+#[derive(
+    Eq,
+    PartialEq,
+    Copy,
+    Clone,
+    strum_macros::Display,
+    strum_macros::EnumIter,
+    measured::FixedCardinalityLabel,
+)]
 #[strum(serialize_all = "snake_case")]
 pub(crate) enum LeadershipStatus {
     /// This is the steady state where the storage controller can produce
@@ -226,22 +237,12 @@ impl ServiceState {
         tenants: BTreeMap<TenantShardId, TenantShard>,
         scheduler: Scheduler,
         delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
+        initial_leadership_status: LeadershipStatus,
     ) -> Self {
-        let status = &crate::metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_leadership_status;
-
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::Leader,
-            },
-            1,
-        );
+        metrics::update_leadership_status(initial_leadership_status);
 
         Self {
-            // TODO: Starting up as Leader is a transient state. Once we enable rolling
-            // upgrades on the k8s side, we should start up as Candidate.
-            leadership_status: LeadershipStatus::Leader,
+            leadership_status: initial_leadership_status,
             tenants,
             nodes: Arc::new(nodes),
             scheduler,
@@ -266,29 +267,12 @@ impl ServiceState {
 
     fn step_down(&mut self) {
         self.leadership_status = LeadershipStatus::SteppedDown;
+        metrics::update_leadership_status(self.leadership_status);
+    }
 
-        let status = &crate::metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_leadership_status;
-
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::SteppedDown,
-            },
-            1,
-        );
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::Leader,
-            },
-            0,
-        );
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::Candidate,
-            },
-            0,
-        );
+    fn become_leader(&mut self) {
+        self.leadership_status = LeadershipStatus::Leader;
+        metrics::update_leadership_status(self.leadership_status);
     }
 }
 
@@ -332,6 +316,12 @@ pub struct Config {
     // by more than the configured amount, then the secondary is not
     // upgraded to primary.
     pub max_secondary_lag_bytes: Option<u64>,
+
+    pub address_for_peers: Option<Uri>,
+
+    pub start_as_candidate: bool,
+
+    pub http_service_port: i32,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -499,9 +489,10 @@ pub(crate) enum ReconcileResultRequest {
     Stop,
 }
 
-// TODO: move this into the storcon peer client when that gets added
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);
+struct LeaderStepDownState {
+    observed: GlobalObservedState,
+    leader: ControllerPersistence,
+}
 
 impl Service {
     pub fn get_config(&self) -> &Config {
@@ -513,15 +504,11 @@ impl Service {
     #[instrument(skip_all)]
     async fn startup_reconcile(
         self: &Arc<Service>,
+        leader_step_down_state: Option<LeaderStepDownState>,
         bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
             Result<(), (TenantShardId, NotifyError)>,
         >,
     ) {
-        // For all tenant shards, a vector of observed states on nodes (where None means
-        // indeterminate, same as in [`ObservedStateLocation`])
-        let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
-            HashMap::new();
-
         // Startup reconciliation does I/O to other services: whether they
         // are responsive or not, we should aim to finish within our deadline, because:
         // - If we don't, a k8s readiness hook watching /ready will kill us.
@@ -535,26 +522,28 @@ impl Service {
             .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
             .expect("Reconcile timeout is a modest constant");
 
+        let (observed, current_leader) = if let Some(state) = leader_step_down_state {
+            tracing::info!(
+                "Using observed state received from leader at {}",
+                state.leader.address,
+            );
+            (state.observed, Some(state.leader))
+        } else {
+            (
+                self.build_global_observed_state(node_scan_deadline).await,
+                None,
+            )
+        };
+
         // Accumulate a list of any tenant locations that ought to be detached
         let mut cleanup = Vec::new();
 
-        let node_listings = self.scan_node_locations(node_scan_deadline).await;
-        // Send initial heartbeat requests to nodes that replied to the location listing above.
-        let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await;
-
-        for (node_id, list_response) in node_listings {
-            let tenant_shards = list_response.tenant_shards;
-            tracing::info!(
-                "Received {} shard statuses from pageserver {}, setting it to Active",
-                tenant_shards.len(),
-                node_id
-            );
-
-            for (tenant_shard_id, conf_opt) in tenant_shards {
-                let shard_observations = observed.entry(tenant_shard_id).or_default();
-                shard_observations.push((node_id, conf_opt));
-            }
-        }
+        // Send initial heartbeat requests to all nodes loaded from the database
+        let all_nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+        let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
 
         // List of tenants for which we will attempt to notify compute of their location at startup
         let mut compute_notifications = Vec::new();
@@ -577,17 +566,16 @@ impl Service {
             }
             *nodes = Arc::new(new_nodes);
 
-            for (tenant_shard_id, shard_observations) in observed {
-                for (node_id, observed_loc) in shard_observations {
-                    let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
-                        cleanup.push((tenant_shard_id, node_id));
-                        continue;
-                    };
-                    tenant_shard
-                        .observed
-                        .locations
-                        .insert(node_id, ObservedStateLocation { conf: observed_loc });
-                }
+            for (tenant_shard_id, observed_state) in observed.0 {
+                let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
+                    for node_id in observed_state.locations.keys() {
+                        cleanup.push((tenant_shard_id, *node_id));
+                    }
+
+                    continue;
+                };
+
+                tenant_shard.observed = observed_state;
             }
 
             // Populate each tenant's intent state
@@ -621,6 +609,28 @@ impl Service {
             tenants.len()
         };
 
+        // Before making any obeservable changes to the cluster, persist self
+        // as leader in database and memory.
+        if let Some(address_for_peers) = &self.config.address_for_peers {
+            // TODO: `address-for-peers` can become a mandatory cli arg
+            // after we update the k8s setup
+            let proposed_leader = ControllerPersistence {
+                address: address_for_peers.to_string(),
+                started_at: chrono::Utc::now(),
+            };
+
+            if let Err(err) = self
+                .persistence
+                .update_leader(current_leader, proposed_leader)
+                .await
+            {
+                tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
+                std::process::exit(1);
+            }
+        }
+
+        self.inner.write().unwrap().become_leader();
+
         // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
         // generation_pageserver in the database.
 
@@ -786,6 +796,31 @@ impl Service {
         node_results
     }
 
+    async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState {
+        let node_listings = self.scan_node_locations(deadline).await;
+        let mut observed = GlobalObservedState::default();
+
+        for (node_id, location_confs) in node_listings {
+            tracing::info!(
+                "Received {} shard statuses from pageserver {}",
+                location_confs.tenant_shards.len(),
+                node_id
+            );
+
+            for (tid, location_conf) in location_confs.tenant_shards {
+                let entry = observed.0.entry(tid).or_default();
+                entry.locations.insert(
+                    node_id,
+                    ObservedStateLocation {
+                        conf: location_conf,
+                    },
+                );
+            }
+        }
+
+        observed
+    }
+
     /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers.
     ///
     /// This is safe to run in the background, because if we don't have this TenantShardId in our map of
@@ -1264,12 +1299,20 @@ impl Service {
             config.max_warming_up_interval,
             cancel.clone(),
         );
+
+        let initial_leadership_status = if config.start_as_candidate {
+            LeadershipStatus::Candidate
+        } else {
+            LeadershipStatus::Leader
+        };
+
         let this = Arc::new(Self {
             inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                 nodes,
                 tenants,
                 scheduler,
                 delayed_reconcile_rx,
+                initial_leadership_status,
             ))),
             config: config.clone(),
             persistence,
@@ -1338,7 +1381,16 @@ impl Service {
                     return;
                 };
 
-                this.startup_reconcile(bg_compute_notify_result_tx).await;
+                let leadership_status = this.inner.read().unwrap().get_leadership_status();
+                let peer_observed_state = match leadership_status {
+                    LeadershipStatus::Candidate => this.request_step_down().await,
+                    LeadershipStatus::Leader => None,
+                    LeadershipStatus::SteppedDown => unreachable!(),
+                };
+
+                this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
+                    .await;
+
                 drop(startup_completion);
             }
         });
@@ -6285,4 +6337,61 @@ impl Service {
 
         global_observed
     }
+
+    /// Request step down from the currently registered leader in the database
+    ///
+    /// If such an entry is persisted, the success path returns the observed
+    /// state and details of the leader. Otherwise, None is returned indicating
+    /// there is no leader currently.
+    ///
+    /// On failures to query the database or step down error responses the process is killed
+    /// and we rely on k8s to retry.
+    async fn request_step_down(&self) -> Option<LeaderStepDownState> {
+        let leader = match self.persistence.get_leader().await {
+            Ok(leader) => leader,
+            Err(err) => {
+                tracing::error!(
+                    "Failed to query database for current leader: {err}. Aborting start-up ..."
+                );
+                std::process::exit(1);
+            }
+        };
+
+        match leader {
+            Some(leader) => {
+                tracing::info!("Sending step down request to {leader:?}");
+
+                // TODO: jwt token
+                let client = PeerClient::new(
+                    Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
+                    self.config.jwt_token.clone(),
+                );
+                let state = client.step_down(&self.cancel).await;
+                match state {
+                    Ok(state) => Some(LeaderStepDownState {
+                        observed: state,
+                        leader: leader.clone(),
+                    }),
+                    Err(err) => {
+                        // TODO: Make leaders periodically update a timestamp field in the
+                        // database and, if the leader is not reachable from the current instance,
+                        // but inferred as alive from the timestamp, abort start-up. This avoids
+                        // a potential scenario in which we have two controllers acting as leaders.
+                        tracing::error!(
+                            "Leader ({}) did not respond to step-down request: {}",
+                            leader.address,
+                            err
+                        );
+                        None
+                    }
+                }
+            }
+            None => {
+                tracing::info!(
+                    "No leader found to request step down from. Will build observed state."
+                );
+                None
+            }
+        }
+    }
 }

From ce0d0a204ce9f77d6f7fe23b2bd2f393a75c7b6b Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 12 Aug 2024 19:15:48 +0200
Subject: [PATCH 15/62] fix(walredo): shutdown can complete too early (#8701)

Problem
-------

The following race is possible today:

```
walredo_extraordinary_shutdown_thread: shutdown gets until Poll::Pending of self.launched_processes.close().await call

other thread: drops the last Arc<Process>
  = 1. drop(_launched_processes_guard) runs, this ...

walredo_extraordinary_shutdown_thread: ... wakes self.launched_processes.close().await

walredo_extraordinary_shutdown_thread: logs `done`

other thread:
  = 2. drop(process): this kill & waits
```

Solution
--------

Change drop order so that `process` gets dropped first.

Context
-------


https://neondb.slack.com/archives/C06Q661FA4C/p1723478188785719?thread_ts=1723456706.465789&cid=C06Q661FA4C

refs https://github.com/neondatabase/neon/pull/8572
refs https://github.com/neondatabase/cloud/issues/11387
---
 pageserver/src/walredo.rs | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 770081b3b4..82585f9ed8 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -107,8 +107,10 @@ enum ProcessOnceCell {
 }
 
 struct Process {
-    _launched_processes_guard: utils::sync::gate::GateGuard,
     process: process::WalRedoProcess,
+    /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`].
+    /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit).
+    _launched_processes_guard: utils::sync::gate::GateGuard,
 }
 
 impl std::ops::Deref for Process {
@@ -327,20 +329,23 @@ impl PostgresRedoManager {
                 },
                 Err(permit) => {
                     let start = Instant::now();
-                    let proc = Arc::new(Process {
-                            _launched_processes_guard: match self.launched_processes.enter() {
+                    // acquire guard before spawning process, so that we don't spawn new processes
+                    // if the gate is already closed.
+                    let _launched_processes_guard = match self.launched_processes.enter() {
                                 Ok(guard) => guard,
                                 Err(GateError::GateClosed) => unreachable!(
                                     "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
                                 ),
-                            },
-                            process: process::WalRedoProcess::launch(
-                                self.conf,
-                                self.tenant_shard_id,
-                                pg_version,
-                            )
-                            .context("launch walredo process")?,
-                        });
+                            };
+                    let proc = Arc::new(Process {
+                        process: process::WalRedoProcess::launch(
+                            self.conf,
+                            self.tenant_shard_id,
+                            pg_version,
+                        )
+                        .context("launch walredo process")?,
+                        _launched_processes_guard,
+                    });
                     let duration = start.elapsed();
                     WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
                     info!(

From f57c2fe8fbb19da0fa4c9d21cf73e000981c3bad Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Mon, 12 Aug 2024 21:46:35 +0200
Subject: [PATCH 16/62] Automatically prepare/restore Aurora and RDS databases
 from pg_dump in benchmarking workflow (#8682)

## Problem

We use infrastructure as code (TF) to deploy AWS Aurora and AWS RDS
Postgres database clusters.
Whenever we have a change in TF (e.g. **every year** to upgrade to a
higher Postgres version or when we change the cluster configuration) TF
will apply the change and create a new AWS database cluster.

However our benchmarking testcase also expects databases in these
clusters and tables loaded with data.
So we add auto-detection - if the AWS RDS instances are "empty" we
create the necessary databases and restore a pg_dump.

**Important Notes:**

- These steps are NOT run in each benchmarking run, but only after a new
RDS instance has been deployed.
- the benchmarking workflows use GitHub secrets to find the connection
string for the database. These secrets still need to be (manually or
programmatically using git cli) updated if some port of the connection
string (e.g. user, password or hostname) changes.

## Summary of changes

In each benchmarking run check if
- database has already been created - if not create it
- database has already been restored - if not restore it

Supported databases
- tpch
- clickbench
- user example

Supported platforms:
- AWS RDS Postgres
- AWS Aurora serverless Postgres

Sample workflow run - but this one uses Neon database to test the
restore step and not real AWS databases


https://github.com/neondatabase/neon/actions/runs/10321441086/job/28574350581

Sample workflow run - with real AWS database clusters

https://github.com/neondatabase/neon/actions/runs/10346816389/job/28635997653

Verification in second run - with real AWS database clusters - that
second time the restore is skipped

https://github.com/neondatabase/neon/actions/runs/10348469517/job/28640778223
---
 .../workflows/_benchmarking_preparation.yml   | 149 ++++++++++++++++++
 .github/workflows/benchmarking.yml            |  27 ++--
 2 files changed, 166 insertions(+), 10 deletions(-)
 create mode 100644 .github/workflows/_benchmarking_preparation.yml

diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml
new file mode 100644
index 0000000000..0f540afab7
--- /dev/null
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -0,0 +1,149 @@
+name: Prepare benchmarking databases by restoring dumps
+
+on:
+  workflow_call:
+    # no inputs needed
+    
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+jobs:
+  setup-databases:
+    strategy:
+      fail-fast: false
+      matrix:
+        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres ] 
+        database: [ clickbench, tpch, userexample ]
+  
+    env:
+      LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
+      PLATFORM: ${{ matrix.platform }}
+      PG_BINARIES: /tmp/neon/pg_install/v16/bin
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+    - name: Set up Connection String
+      id: set-up-prep-connstr
+      run: |
+        case "${PLATFORM}" in
+          aws-rds-postgres)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} 
+            ;;
+          aws-aurora-serverless-v2-postgres)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} 
+            ;;
+          *)
+            echo >&2 "Unknown PLATFORM=${PLATFORM}"
+            exit 1
+            ;;
+        esac
+
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+
+    # we create a table that has one row for each database that we want to restore with the status whether the restore is done    
+    - name: Create benchmark_restore_status table if it does not exist
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+        DATABASE_NAME: ${{ matrix.database }}
+      # to avoid a race condition of multiple jobs trying to create the table at the same time, 
+      # we use an advisory lock
+      run: |
+        ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
+        SELECT pg_advisory_lock(4711);  
+        CREATE TABLE IF NOT EXISTS benchmark_restore_status (
+        databasename text primary key,
+        restore_done boolean
+        );
+        SELECT pg_advisory_unlock(4711);
+        "
+    
+    - name: Check if restore is already done
+      id: check-restore-done
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+        DATABASE_NAME: ${{ matrix.database }}
+      run: |
+        skip=false
+        if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then
+          echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database."
+          skip=true
+        fi
+        echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
+
+    - name: Check and create database if it does not exist
+      if: steps.check-restore-done.outputs.skip != 'true'
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+        DATABASE_NAME: ${{ matrix.database }}
+      run: |
+        DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'")
+        if [ "$DB_EXISTS" != "1" ]; then
+          echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..."
+          ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";"
+        else
+          echo "Database ${{ env.DATABASE_NAME }} already exists."
+        fi
+
+    - name: Download dump from S3 to /tmp/dumps
+      if: steps.check-restore-done.outputs.skip != 'true'
+      env:
+        DATABASE_NAME: ${{ matrix.database }}
+      run: |
+        mkdir -p /tmp/dumps
+        aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ 
+
+    - name: Replace database name in connection string
+      if: steps.check-restore-done.outputs.skip != 'true'
+      id: replace-dbname
+      env:
+        DATABASE_NAME: ${{ matrix.database }}
+        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+      run: |
+        # Extract the part before the database name
+        base_connstr="${BENCHMARK_CONNSTR%/*}"
+        # Extract the query parameters (if any) after the database name
+        query_params="${BENCHMARK_CONNSTR#*\?}"
+        # Reconstruct the new connection string
+        if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then
+          new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}"
+        else
+          new_connstr="${base_connstr}/${DATABASE_NAME}"
+        fi
+        echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT  
+
+    - name: Restore dump
+      if: steps.check-restore-done.outputs.skip != 'true'
+      env:
+        DATABASE_NAME: ${{ matrix.database }}
+        DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
+        # the following works only with larger computes: 
+        # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
+        # we add the || true because:
+        # the dumps were created with Neon and contain neon extensions that are not 
+        # available in RDS, so we will always report an error, but we can ignore it
+      run: |
+        ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
+        -d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true
+
+    - name: Update benchmark_restore_status table
+      if: steps.check-restore-done.outputs.skip != 'true'
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+        DATABASE_NAME: ${{ matrix.database }}
+      run: |
+        ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
+        INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true)
+        ON CONFLICT (databasename) DO UPDATE SET restore_done = true;
+        "
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 6f80d6e431..106c3e3138 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -280,8 +280,9 @@ jobs:
                       { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
         }'
 
-        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
+        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                                                     { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
         fi
 
         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -321,9 +322,13 @@ jobs:
 
         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
 
+  prepare_AWS_RDS_databases:
+    uses: ./.github/workflows/_benchmarking_preparation.yml
+    secrets: inherit
+  
   pgbench-compare:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    needs: [ generate-matrices ]
+    needs: [ generate-matrices, prepare_AWS_RDS_databases ]
     permissions:
       contents: write
       statuses: write
@@ -595,7 +600,7 @@ jobs:
     # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
     # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
     if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
-    needs: [ generate-matrices, pgbench-compare ]
+    needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
 
     strategy:
       fail-fast: false
@@ -603,7 +608,7 @@ jobs:
 
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: 16
       TEST_OUTPUT: /tmp/test_output
       TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
       TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
@@ -655,6 +660,7 @@ jobs:
         run_in_parallel: false
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -684,7 +690,7 @@ jobs:
     #
     # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
     if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
-    needs: [ generate-matrices, clickbench-compare ]
+    needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
 
     strategy:
       fail-fast: false
@@ -692,7 +698,7 @@ jobs:
 
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: 16
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -724,7 +730,7 @@ jobs:
             ENV_PLATFORM=RDS_AURORA_TPCH
             ;;
           rds-postgres)
-            ENV_PLATFORM=RDS_AURORA_TPCH
+            ENV_PLATFORM=RDS_POSTGRES_TPCH
             ;;
           *)
             echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -750,6 +756,7 @@ jobs:
         run_in_parallel: false
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600 -k test_tpch
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -771,7 +778,7 @@ jobs:
 
   user-examples-compare:
     if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
-    needs: [ generate-matrices, tpch-compare ]
+    needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
 
     strategy:
       fail-fast: false
@@ -779,7 +786,7 @@ jobs:
 
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: 16
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}

From 32aa1fc68151a7393801447e2f33688d78b07ea1 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Mon, 12 Aug 2024 21:54:42 -0700
Subject: [PATCH 17/62] Add on-demand WAL download to slot funcs (#8705)

## Problem
Currently we can have an issue where if someone does
`pg_logical_slot_advance`, it could fail because it doesn't have the WAL
locally.

## Summary of changes
Adds on-demand WAL download and a test to these slot funcs. Before
adding these, the test fails with
```
requested WAL segment pg_wal/000000010000000000000001 has already been removed
```
After the changes, the test passes


Relies on:
- https://github.com/neondatabase/postgres/pull/466
- https://github.com/neondatabase/postgres/pull/467
- https://github.com/neondatabase/postgres/pull/468
---
 pgxn/neon/neon.c                                |  1 +
 test_runner/regress/test_logical_replication.py | 15 +++++++++++++++
 vendor/postgres-v14                             |  2 +-
 vendor/postgres-v15                             |  2 +-
 vendor/postgres-v16                             |  2 +-
 vendor/revisions.json                           |  6 +++---
 6 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index d107cdc1c2..9cdbf4a126 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -599,6 +599,7 @@ _PG_init(void)
 	pg_init_walproposer();
 	WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 	LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
+	SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 
 	InitLogicalReplicationMonitor();
 
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 5a5d369a11..0d18aa43b7 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -255,6 +255,21 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
         cur.execute(
             "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')"
         )
+        cur.execute(
+            """
+INSERT INTO wal_generator (data)
+SELECT repeat('A', 1024) -- Generates a kilobyte of data per row
+FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data
+"""
+        )
+
+    endpoint.stop_and_destroy()
+    endpoint = env.endpoints.create_start("init")
+    with endpoint.connect().cursor() as cur:
+        log.info("advance slot")
+        cur.execute(
+            "SELECT * from pg_replication_slot_advance('slotty_mcslotface', pg_current_wal_lsn())"
+        )
 
 
 # Tests that walsender correctly blocks until WAL is downloaded from safekeepers
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index ae07734e0f..a48faca1d9 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit ae07734e0ff72759ab425fc8f625d4c1ecb15a50
+Subproject commit a48faca1d9aef59649dd1bf34bc1b6303fa3489e
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 47c8d462d1..39c51c33b3 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 47c8d462d169367c8979ce628a523be2d94b46be
+Subproject commit 39c51c33b383239c78b86afe561679f980e44842
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 6434b1499b..5ea106b258 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 6434b1499b11ed97dccea5618a055034b83b8e2f
+Subproject commit 5ea106b2583285849784e774b39d62eb2615bd5d
diff --git a/vendor/revisions.json b/vendor/revisions.json
index ab8b3b3c4f..f983407268 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,14 +1,14 @@
 {
   "v16": [
     "16.3",
-    "6434b1499b11ed97dccea5618a055034b83b8e2f"
+    "5ea106b2583285849784e774b39d62eb2615bd5d"
   ],
   "v15": [
     "15.7",
-    "47c8d462d169367c8979ce628a523be2d94b46be"
+    "39c51c33b383239c78b86afe561679f980e44842"
   ],
   "v14": [
     "14.12",
-    "ae07734e0ff72759ab425fc8f625d4c1ecb15a50"
+    "a48faca1d9aef59649dd1bf34bc1b6303fa3489e"
   ]
 }

From d24f1b6c044150013a020b9856186b9dba5f6c28 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 12 Aug 2024 19:28:03 +0300
Subject: [PATCH 18/62] Allow logical_replication_max_snap_files = -1

which disables the mechanism.
---
 pgxn/neon/neon.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 9cdbf4a126..6afca61fae 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -68,10 +68,10 @@ InitLogicalReplicationMonitor(void)
 
 	DefineCustomIntVariable(
 							"neon.logical_replication_max_snap_files",
-							"Maximum allowed logical replication .snap files",
+							"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
 							NULL,
 							&logical_replication_max_snap_files,
-							300, 0, INT_MAX,
+							300, -1, INT_MAX,
 							PGC_SIGHUP,
 							0,
 							NULL, NULL, NULL);

From 3379cbcaa451905eac32f18d3bb7a8f0d2e74fbd Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 13 Aug 2024 11:48:23 +0100
Subject: [PATCH 19/62] pageserver: add CompactKey, use it in InMemoryLayer
 (#8652)

## Problem

This follows a PR that insists all input keys are representable in 16
bytes:
- https://github.com/neondatabase/neon/pull/8648

& a PR that prevents postgres from sending us keys that use the high
bits of field2:
- https://github.com/neondatabase/neon/pull/8657

Motivation for this change:
1. Ingest is bottlenecked on CPU
2. InMemoryLayer can create huge (~1M value) BTreeMap<Key,_> for its
index.
3. Maps over i128 are much faster than maps over an arbitrary 18 byte
struct.

It may still be worthwhile to make the index two-tier to optimize for
the case where only the last 4 bytes (blkno) of the key vary frequently,
but simply using the i128 representation of keys has a big impact for
very little effort.

Related: #8452

## Summary of changes

- Introduce `CompactKey` type which contains an i128
- Use this instead of Key in InMemoryLayer's index, converting back and
forth as needed.

## Performance

All the small-value `bench_ingest` cases show improved throughput.

The one that exercises this index most directly shows a 35% throughput
increase:

```
ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [374.29 ms 378.56 ms 383.38 ms]
                        thrpt:  [333.88 MiB/s 338.13 MiB/s 341.98 MiB/s]
                 change:
                        time:   [-26.993% -26.117% -25.111%] (p = 0.00 < 0.05)
                        thrpt:  [+33.531% +35.349% +36.974%]
                        Performance has improved.
```
---
 libs/pageserver_api/src/key.rs                | 20 +++++++++++++
 pageserver/benches/bench_ingest.rs            |  2 +-
 .../tenant/storage_layer/inmemory_layer.rs    | 29 +++++++++++--------
 pageserver/src/tenant/timeline.rs             |  2 +-
 4 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 3af3f74e9c..2fdd7de38f 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -22,6 +22,11 @@ pub struct Key {
     pub field6: u32,
 }
 
+/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
+/// a struct of fields.
+#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
+pub struct CompactKey(i128);
+
 /// The storage key size.
 pub const KEY_SIZE: usize = 18;
 
@@ -130,6 +135,14 @@ impl Key {
         }
     }
 
+    pub fn to_compact(&self) -> CompactKey {
+        CompactKey(self.to_i128())
+    }
+
+    pub fn from_compact(k: CompactKey) -> Self {
+        Self::from_i128(k.0)
+    }
+
     pub const fn next(&self) -> Key {
         self.add(1)
     }
@@ -199,6 +212,13 @@ impl fmt::Display for Key {
     }
 }
 
+impl fmt::Display for CompactKey {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let k = Key::from_compact(*self);
+        k.fmt(f)
+    }
+}
+
 impl Key {
     pub const MIN: Key = Key {
         field1: u8::MIN,
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 9bab02e46c..0336302de0 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -95,7 +95,7 @@ async fn ingest(
             }
         }
 
-        layer.put_value(key, lsn, &data, &ctx).await?;
+        layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
     }
     layer.freeze(lsn + 1).await;
 
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 57d93feaaf..fb15ddfba9 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -15,6 +15,7 @@ use crate::tenant::PageReconstructError;
 use crate::{l0_flush, page_cache, walrecord};
 use anyhow::{anyhow, Result};
 use camino::Utf8PathBuf;
+use pageserver_api::key::CompactKey;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -78,7 +79,7 @@ pub struct InMemoryLayerInner {
     /// All versions of all pages in the layer are kept here. Indexed
     /// by block number and LSN. The value is an offset into the
     /// ephemeral file where the page version is stored.
-    index: BTreeMap<Key, VecMap<Lsn, u64>>,
+    index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
 
     /// The values are stored in a serialized format in this file.
     /// Each serialized Value is preceded by a 'u32' length field.
@@ -312,8 +313,12 @@ impl InMemoryLayer {
         let reader = inner.file.block_cursor();
 
         for range in keyspace.ranges.iter() {
-            for (key, vec_map) in inner.index.range(range.start..range.end) {
-                let lsn_range = match reconstruct_state.get_cached_lsn(key) {
+            for (key, vec_map) in inner
+                .index
+                .range(range.start.to_compact()..range.end.to_compact())
+            {
+                let key = Key::from_compact(*key);
+                let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
                     Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
                     None => self.start_lsn..end_lsn,
                 };
@@ -324,20 +329,18 @@ impl InMemoryLayer {
                     // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
                     let buf = reader.read_blob(*pos, &ctx).await;
                     if let Err(e) = buf {
-                        reconstruct_state
-                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
+                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
                         break;
                     }
 
                     let value = Value::des(&buf.unwrap());
                     if let Err(e) = value {
-                        reconstruct_state
-                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
+                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
                         break;
                     }
 
                     let key_situation =
-                        reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
+                        reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
                     if key_situation == ValueReconstructSituation::Complete {
                         break;
                     }
@@ -417,7 +420,7 @@ impl InMemoryLayer {
     /// Adds the page version to the in-memory tree
     pub async fn put_value(
         &self,
-        key: Key,
+        key: CompactKey,
         lsn: Lsn,
         buf: &[u8],
         ctx: &RequestContext,
@@ -430,7 +433,7 @@ impl InMemoryLayer {
     async fn put_value_locked(
         &self,
         locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: Key,
+        key: CompactKey,
         lsn: Lsn,
         buf: &[u8],
         ctx: &RequestContext,
@@ -539,6 +542,8 @@ impl InMemoryLayer {
         let end_lsn = *self.end_lsn.get().unwrap();
 
         let key_count = if let Some(key_range) = key_range {
+            let key_range = key_range.start.to_compact()..key_range.end.to_compact();
+
             inner
                 .index
                 .iter()
@@ -578,7 +583,7 @@ impl InMemoryLayer {
                         let will_init = Value::des(&buf)?.will_init();
                         let res;
                         (buf, res) = delta_layer_writer
-                            .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
+                            .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, &ctx)
                             .await;
                         res?;
                     }
@@ -617,7 +622,7 @@ impl InMemoryLayer {
                         let will_init = Value::des(&buf)?.will_init();
                         let res;
                         (buf, res) = delta_layer_writer
-                            .put_value_bytes(*key, *lsn, buf, will_init, ctx)
+                            .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, ctx)
                             .await;
                         res?;
                     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b003834adf..9bce9c1fac 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5553,7 +5553,7 @@ impl<'a> TimelineWriter<'a> {
 
         let action = self.get_open_layer_action(lsn, buf_size);
         let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
-        let res = layer.put_value(key, lsn, &buf, ctx).await;
+        let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
 
         if res.is_ok() {
             // Update the current size only when the entire write was ok.

From b9d2c7bdd555e5c99e1e8ab7f418be6647407a57 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 13 Aug 2024 12:45:54 +0100
Subject: [PATCH 20/62] pageserver: remove vectored get related configs (#8695)

## Problem
Pageserver exposes some vectored get related configs which are not in
use.

## Summary of changes
Remove the following pageserver configs: get_impl, get_vectored_impl,
and `validate_get_vectored`.
They are not used in the pageserver since
https://github.com/neondatabase/neon/pull/8601.
Manual overrides have been removed from the aws repo in
https://github.com/neondatabase/aws/pull/1664.
---
 pageserver/src/bin/pageserver.rs              |  2 -
 pageserver/src/config.rs                      | 58 +------------------
 pageserver/src/tenant/timeline.rs             | 37 +-----------
 .../pagebench/test_large_slru_basebackup.py   |  3 +-
 4 files changed, 4 insertions(+), 96 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 932918410c..da0c11d9bf 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -124,8 +124,6 @@ fn main() -> anyhow::Result<()> {
     // after setting up logging, log the effective IO engine choice and read path implementations
     info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
     info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
-    info!(?conf.get_impl, "starting with get page implementation");
-    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
     info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
 
     let tenants_path = conf.tenants_path();
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index f4c367bd4d..3ac5ac539f 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -29,12 +29,12 @@ use utils::{
     logging::LogFormat,
 };
 
+use crate::l0_flush::L0FlushConfig;
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
-use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
-use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
 use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
 
@@ -133,14 +133,8 @@ pub mod defaults {
 
 #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
 
-#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
-
-#get_impl = '{DEFAULT_GET_IMPL}'
-
 #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
 
-#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
-
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -278,14 +272,8 @@ pub struct PageServerConf {
 
     pub virtual_file_io_engine: virtual_file::IoEngineKind,
 
-    pub get_vectored_impl: GetVectoredImpl,
-
-    pub get_impl: GetImpl,
-
     pub max_vectored_read_bytes: MaxVectoredReadBytes,
 
-    pub validate_vectored_get: bool,
-
     pub image_compression: ImageCompressionAlgorithm,
 
     /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
@@ -396,14 +384,8 @@ struct PageServerConfigBuilder {
 
     virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
 
-    get_vectored_impl: BuilderValue<GetVectoredImpl>,
-
-    get_impl: BuilderValue<GetImpl>,
-
     max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
 
-    validate_vectored_get: BuilderValue<bool>,
-
     image_compression: BuilderValue<ImageCompressionAlgorithm>,
 
     ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
@@ -493,13 +475,10 @@ impl PageServerConfigBuilder {
 
             virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
 
-            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
-            get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
             max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                 NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
             )),
             image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
-            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
             ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: Set(L0FlushConfig::default()),
             compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
@@ -659,22 +638,10 @@ impl PageServerConfigBuilder {
         self.virtual_file_io_engine = BuilderValue::Set(value);
     }
 
-    pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
-        self.get_vectored_impl = BuilderValue::Set(value);
-    }
-
-    pub fn get_impl(&mut self, value: GetImpl) {
-        self.get_impl = BuilderValue::Set(value);
-    }
-
     pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
         self.max_vectored_read_bytes = BuilderValue::Set(value);
     }
 
-    pub fn get_validate_vectored_get(&mut self, value: bool) {
-        self.validate_vectored_get = BuilderValue::Set(value);
-    }
-
     pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
         self.image_compression = BuilderValue::Set(value);
     }
@@ -745,10 +712,7 @@ impl PageServerConfigBuilder {
                 heatmap_upload_concurrency,
                 secondary_download_concurrency,
                 ingest_batch_size,
-                get_vectored_impl,
-                get_impl,
                 max_vectored_read_bytes,
-                validate_vectored_get,
                 image_compression,
                 ephemeral_bytes_per_memory_kb,
                 l0_flush,
@@ -1002,21 +966,12 @@ impl PageServerConf {
                 "virtual_file_io_engine" => {
                     builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
                 }
-                "get_vectored_impl" => {
-                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
-                }
-                "get_impl" => {
-                    builder.get_impl(parse_toml_from_str("get_impl", item)?)
-                }
                 "max_vectored_read_bytes" => {
                     let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
                     builder.get_max_vectored_read_bytes(
                         MaxVectoredReadBytes(
                             NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
                 }
-                "validate_vectored_get" => {
-                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
-                }
                 "image_compression" => {
                     builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
                 }
@@ -1106,14 +1061,11 @@ impl PageServerConf {
             secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
             ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
             virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-            get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
             max_vectored_read_bytes: MaxVectoredReadBytes(
                 NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                     .expect("Invalid default constant"),
             ),
             image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
-            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
             ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
             l0_flush: L0FlushConfig::default(),
             compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1349,13 +1301,10 @@ background_task_maximum_delay = '334 s'
                 secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                 virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                 max_vectored_read_bytes: MaxVectoredReadBytes(
                     NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                         .expect("Invalid default constant")
                 ),
-                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                 image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                 l0_flush: L0FlushConfig::default(),
@@ -1425,13 +1374,10 @@ background_task_maximum_delay = '334 s'
                 secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: 100,
                 virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                 max_vectored_read_bytes: MaxVectoredReadBytes(
                     NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                         .expect("Invalid default constant")
                 ),
-                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                 image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                 l0_flush: L0FlushConfig::default(),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 9bce9c1fac..abe3f56e45 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -802,40 +802,6 @@ impl From<GetReadyAncestorError> for PageReconstructError {
     }
 }
 
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetVectoredImpl {
-    Sequential,
-    Vectored,
-}
-
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetImpl {
-    Legacy,
-    Vectored,
-}
-
 pub(crate) enum WaitLsnWaiter<'a> {
     Timeline(&'a Timeline),
     Tenant,
@@ -995,11 +961,10 @@ impl Timeline {
         }
 
         trace!(
-            "get vectored request for {:?}@{} from task kind {:?} will use {} implementation",
+            "get vectored request for {:?}@{} from task kind {:?}",
             keyspace,
             lsn,
             ctx.task_kind(),
-            self.conf.get_vectored_impl
         );
 
         let start = crate::metrics::GET_VECTORED_LATENCY
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index 3258d4dcfa..8b934057e4 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -44,8 +44,7 @@ def test_basebackup_with_high_slru_count(
     page_cache_size = 16384
     max_file_descriptors = 500000
     neon_env_builder.pageserver_config_override = (
-        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
-        f"get_vectored_impl='vectored'; validate_vectored_get=false"
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
     )
     params.update(
         {

From afb68b0e7eda8c86ca0d2994c2499f25a46655a0 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 13 Aug 2024 15:07:24 +0300
Subject: [PATCH 21/62] Report search_path to make it possible to use it in
 pgbouncer track_extra_parameters (#8303)

## Problem

When pooled connections are used, session semantic its not preserved,
including GUC settings.
Many customers have particular problem with setting search_path.
But pgbouncer 1.20 has `track_extra_parameters` settings which allows to
track parameters included in startup package which are reported by
Postgres. Postgres has [an official list of parameters that it reports
to the
client](https://www.postgresql.org/docs/15/protocol-flow.html#PROTOCOL-ASYNC).
This PR makes Postgres also report `search_path` and so allows to
include it in `track_extra_parameters`.


## Summary of changes

Set GUC_REPORT flag  for `search_path`.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/neon.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 6afca61fae..784d0f1da3 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -32,6 +32,7 @@
 #include "utils/builtins.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
+#include "utils/guc_tables.h"
 #include "utils/wait_event.h"
 
 #include "extension_server.h"
@@ -584,6 +585,40 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n
 	return false;
 }
 
+
+/*
+ * pgbouncer is able to track GUCs reported by Postgres.
+ * But most parameters cannot be tracked this way. The only parameters that can be tracked are ones
+ * that Postgres reports to the client. Unfortunately `search_path` is not reported by Postgres:
+ * https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be
+ * This code sets GUC_REPORT flag for `search_path`making it possible to include it in
+ * pgbouncer's `track_extra_parameters` list.
+ *
+ * This code is inspired by how the Citus extension does this, see
+ * https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694
+ */
+static void
+ReportSearchPath(void)
+{
+#if PG_VERSION_NUM >= 160000
+	int nGucs = 0;
+	struct config_generic **gucs = get_guc_variables(&nGucs);
+#else
+	struct config_generic **gucs = get_guc_variables();
+	int nGucs = GetNumConfigOptions();
+#endif
+
+	for (int i = 0; i < nGucs; i++)
+	{
+		struct config_generic *guc = (struct config_generic *) gucs[i];
+
+		if (strcmp(guc->name, "search_path") == 0)
+		{
+			guc->flags |= GUC_REPORT;
+		}
+	}
+}
+
 void
 _PG_init(void)
 {
@@ -627,6 +662,8 @@ _PG_init(void)
 	 * extension was loaded will be removed.
 	 */
 	EmitWarningsOnPlaceholders("neon");
+
+	ReportSearchPath();
 }
 
 PG_FUNCTION_INFO_V1(pg_cluster_size);

From ecb01834d645392d508a50d40fa75e746aeef276 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 13 Aug 2024 15:15:55 +0100
Subject: [PATCH 22/62] pageserver: implement utilization score (#8703)

## Problem

When the utilization API was added, it was just a stub with disk space
information.

Disk space information isn't a very good metric for assigning tenants to
pageservers, because pageservers making full use of their disks would
always just have 85% utilization, irrespective of how much pressure they
had for disk space.

## Summary of changes

- Use the new layer visibiilty metric to calculate a "wanted size" per
tenant, and sum these to get a total local disk space wanted per
pageserver. This acts as the primary signal for utilization.
- Also use the shard count to calculate a utilization score, and take
the max of this and the disk-driven utilization. The shard count limit
is currently set as a constant 20,000, which matches contemporary
operational practices when loading pageservers.

The shard count limit means that for tiny/empty tenants, on a machine
with 3.84TB disk, each tiny tenant influences the utilization score as
if it had size 160MB.
---
 libs/pageserver_api/src/models/utilization.rs | 90 +++++++++++++++++--
 pageserver/src/http/routes.rs                 |  5 +-
 pageserver/src/tenant.rs                      | 13 +++
 pageserver/src/tenant/mgr.rs                  | 51 +++++++++++
 pageserver/src/utilization.rs                 | 34 +++++--
 5 files changed, 176 insertions(+), 17 deletions(-)

diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
index e88cab5d6a..0fec221276 100644
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -1,4 +1,5 @@
-use utils::serde_system_time::SystemTime;
+use std::time::SystemTime;
+use utils::{serde_percent::Percent, serde_system_time};
 
 /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
 /// the next tenant.
@@ -9,19 +10,88 @@ use utils::serde_system_time::SystemTime;
 /// not handle full u64 values properly.
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 pub struct PageserverUtilization {
-    /// Used disk space
+    /// Used disk space (physical, ground truth from statfs())
     #[serde(serialize_with = "ser_saturating_u63")]
     pub disk_usage_bytes: u64,
     /// Free disk space
     #[serde(serialize_with = "ser_saturating_u63")]
     pub free_space_bytes: u64,
-    /// Lower is better score for how good candidate for a next tenant would this pageserver be.
-    #[serde(serialize_with = "ser_saturating_u63")]
+
+    /// Wanted disk space, based on the tenant shards currently present on this pageserver: this
+    /// is like disk_usage_bytes, but it is stable and does not change with the cache state of
+    /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
+    /// there, or may be unrealistically low if the pageserver has attached tenants which haven't
+    /// downloaded layers yet.
+    #[serde(serialize_with = "ser_saturating_u63", default)]
+    pub disk_wanted_bytes: u64,
+
+    // What proportion of total disk space will this pageserver use before it starts evicting data?
+    #[serde(default = "unity_percent")]
+    pub disk_usable_pct: Percent,
+
+    // How many shards are currently on this node?
+    #[serde(default)]
+    pub shard_count: u32,
+
+    // How many shards should this node be able to handle at most?
+    #[serde(default)]
+    pub max_shard_count: u32,
+
+    /// Cached result of [`Self::score`]
     pub utilization_score: u64,
+
     /// When was this snapshot captured, pageserver local time.
     ///
     /// Use millis to give confidence that the value is regenerated often enough.
-    pub captured_at: SystemTime,
+    pub captured_at: serde_system_time::SystemTime,
+}
+
+fn unity_percent() -> Percent {
+    Percent::new(0).unwrap()
+}
+
+impl PageserverUtilization {
+    const UTILIZATION_FULL: u64 = 1000000;
+
+    /// Calculate a utilization score.  The result is to be inrepreted as a fraction of
+    /// Self::UTILIZATION_FULL.
+    ///
+    /// Lower values are more affine to scheduling more work on this node.
+    /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
+    /// - 0.0 represents an empty node.
+    /// - Negative values are forbidden
+    /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
+    ///   layer eviction.
+    pub fn score(&self) -> u64 {
+        let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
+            * self.disk_usable_pct.get() as u64)
+            / 100;
+        let disk_utilization_score =
+            self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
+
+        let shard_utilization_score =
+            self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
+        std::cmp::max(disk_utilization_score, shard_utilization_score)
+    }
+
+    pub fn refresh_score(&mut self) {
+        self.utilization_score = self.score();
+    }
+
+    /// A utilization structure that has a full utilization score: use this as a placeholder when
+    /// you need a utilization but don't have real values yet.
+    pub fn full() -> Self {
+        Self {
+            disk_usage_bytes: 1,
+            free_space_bytes: 0,
+            disk_wanted_bytes: 1,
+            disk_usable_pct: Percent::new(100).unwrap(),
+            shard_count: 1,
+            max_shard_count: 1,
+            utilization_score: Self::UTILIZATION_FULL,
+            captured_at: serde_system_time::SystemTime(SystemTime::now()),
+        }
+    }
 }
 
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
@@ -49,15 +119,19 @@ mod tests {
         let doc = PageserverUtilization {
             disk_usage_bytes: u64::MAX,
             free_space_bytes: 0,
-            utilization_score: u64::MAX,
-            captured_at: SystemTime(
+            disk_wanted_bytes: u64::MAX,
+            utilization_score: 13,
+            disk_usable_pct: Percent::new(90).unwrap(),
+            shard_count: 100,
+            max_shard_count: 200,
+            captured_at: serde_system_time::SystemTime(
                 std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
             ),
         };
 
         let s = serde_json::to_string(&doc).unwrap();
 
-        let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
+        let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}";
 
         assert_eq!(s, expected);
     }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a983d8c4c2..2b0156079e 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2357,8 +2357,9 @@ async fn get_utilization(
     // regenerate at most 1Hz to allow polling at any rate.
     if !still_valid {
         let path = state.conf.tenants_path();
-        let doc = crate::utilization::regenerate(path.as_std_path())
-            .map_err(ApiError::InternalServerError)?;
+        let doc =
+            crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager)
+                .map_err(ApiError::InternalServerError)?;
 
         let mut buf = Vec::new();
         serde_json::to_writer(&mut buf, &doc)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index cfdb32f755..a238004aad 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3732,6 +3732,19 @@ impl Tenant {
     pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
         self.tenant_conf.load().tenant_conf.clone()
     }
+
+    /// How much local storage would this tenant like to have?  It can cope with
+    /// less than this (via eviction and on-demand downloads), but this function enables
+    /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
+    /// by keeping important things on local disk.
+    pub(crate) fn local_storage_wanted(&self) -> u64 {
+        let mut wanted = 0;
+        let timelines = self.timelines.lock().unwrap();
+        for timeline in timelines.values() {
+            wanted += timeline.metrics.visible_physical_size_gauge.get();
+        }
+        wanted
+    }
 }
 
 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 3316627540..c8a11e88cc 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2085,6 +2085,57 @@ impl TenantManager {
             }
         }
     }
+
+    /// Calculate the tenant shards' contributions to this pageserver's utilization metrics.  The
+    /// returned values are:
+    ///  - the number of bytes of local disk space this pageserver's shards are requesting, i.e.
+    ///    how much space they would use if not impacted by disk usage eviction.
+    ///  - the number of tenant shards currently on this pageserver, including attached
+    ///    and secondary.
+    ///
+    /// This function is quite expensive: callers are expected to cache the result and
+    /// limit how often they call it.
+    pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> {
+        let tenants = self.tenants.read().unwrap();
+        let m = match &*tenants {
+            TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
+        };
+        let shard_count = m.len();
+        let mut wanted_bytes = 0;
+
+        for tenant_slot in m.values() {
+            match tenant_slot {
+                TenantSlot::InProgress(_barrier) => {
+                    // While a slot is being changed, we can't know how much storage it wants.  This
+                    // means this function's output can fluctuate if a lot of changes are going on
+                    // (such as transitions from secondary to attached).
+                    //
+                    // We could wait for the barrier and retry, but it's important that the utilization
+                    // API is responsive, and the data quality impact is not very significant.
+                    continue;
+                }
+                TenantSlot::Attached(tenant) => {
+                    wanted_bytes += tenant.local_storage_wanted();
+                }
+                TenantSlot::Secondary(secondary) => {
+                    let progress = secondary.progress.lock().unwrap();
+                    wanted_bytes += if progress.heatmap_mtime.is_some() {
+                        // If we have heatmap info, then we will 'want' the sum
+                        // of the size of layers in the heatmap: this is how much space
+                        // we would use if not doing any eviction.
+                        progress.bytes_total
+                    } else {
+                        // In the absence of heatmap info, assume that the secondary location simply
+                        // needs as much space as it is currently using.
+                        secondary.resident_size_metric.get()
+                    }
+                }
+            }
+        }
+
+        Ok((wanted_bytes, shard_count as u32))
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
index e6c835aa75..3c48c84598 100644
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -5,12 +5,17 @@
 
 use anyhow::Context;
 use std::path::Path;
+use utils::serde_percent::Percent;
 
 use pageserver_api::models::PageserverUtilization;
 
-pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtilization> {
-    // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough
+use crate::{config::PageServerConf, tenant::mgr::TenantManager};
 
+pub(crate) fn regenerate(
+    conf: &PageServerConf,
+    tenants_path: &Path,
+    tenant_manager: &TenantManager,
+) -> anyhow::Result<PageserverUtilization> {
     let statvfs = nix::sys::statvfs::statvfs(tenants_path)
         .map_err(std::io::Error::from)
         .context("statvfs tenants directory")?;
@@ -34,16 +39,31 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
 
     let captured_at = std::time::SystemTime::now();
 
-    let doc = PageserverUtilization {
+    // Calculate aggregate utilization from tenants on this pageserver
+    let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;
+
+    // Fetch the fraction of disk space which may be used
+    let disk_usable_pct = match conf.disk_usage_based_eviction.clone() {
+        Some(e) => e.max_usage_pct,
+        None => Percent::new(100).unwrap(),
+    };
+
+    // Express a static value for how many shards we may schedule on one node
+    const MAX_SHARDS: u32 = 20000;
+
+    let mut doc = PageserverUtilization {
         disk_usage_bytes: used,
         free_space_bytes: free,
-        // lower is better; start with a constant
-        //
-        // note that u64::MAX will be output as i64::MAX as u64, but that should not matter
-        utilization_score: u64::MAX,
+        disk_wanted_bytes,
+        disk_usable_pct,
+        shard_count,
+        max_shard_count: MAX_SHARDS,
+        utilization_score: 0,
         captured_at: utils::serde_system_time::SystemTime(captured_at),
     };
 
+    doc.refresh_score();
+
     // TODO: make utilization_score into a metric
 
     Ok(doc)

From 852a6a7a5aab76b29f54ae41032fdf46d7826903 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 13 Aug 2024 15:28:26 +0100
Subject: [PATCH 23/62] CI: mark PRs and issues create by external users
 (#8694)

## Problem

We want to mark new PRs and issues created by external users

## Summary of changes
- Add a new workflow which adds `external` label for issues and PRs
created by external users
---
 .../workflows/label-for-external-users.yml    | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 .github/workflows/label-for-external-users.yml

diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
new file mode 100644
index 0000000000..2f19a746e0
--- /dev/null
+++ b/.github/workflows/label-for-external-users.yml
@@ -0,0 +1,35 @@
+name: Add `external` label to issues and PRs created by external users
+
+on:
+  issues:
+    types:
+      - opened
+  pull_request:
+    types:
+      - opened
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+env:
+  LABEL: external
+
+jobs:
+  add-label:
+    # This workflow uses `author_association` for PRs and issues to determine if the user is an external user.
+    # Possible values for `author_association`: https://docs.github.com/en/graphql/reference/enums#commentauthorassociation
+    if: ${{ !contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].author_association) }}
+
+    runs-on: ubuntu-22.04
+    permissions:
+      pull-requests: write
+      issues: write
+
+    steps:
+    - name: Label new ${{ github.event_name }}
+      env:
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }}
+        GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }}
+      run: |
+        gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}

From e0946e334a7a2812e157411eba243d66cfb43394 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 13 Aug 2024 19:07:51 +0300
Subject: [PATCH 24/62] bench: stop immediatedly in some benches (#8713)

It seems that some benchmarks are failing because they are simply not
stopping to ingest wal on shutdown. It might mean that the tests were
never ran on a stable pageserver situation and WAL has always been left
to be ingested on safekeepers, but let's see if this silences the
failures and "stops the bleeding".

Cc: https://github.com/neondatabase/neon/issues/8712
---
 .../pageserver/pagebench/test_ondemand_download_churn.py       | 3 +++
 test_runner/performance/test_layer_map.py                      | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
index 644c1f559b..0348b08f04 100644
--- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
+++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
@@ -62,6 +62,9 @@ def test_download_churn(
 
     run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration)
 
+    # see https://github.com/neondatabase/neon/issues/8712
+    env.stop(immediate=True)
+
 
 def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     remote_storage_kind = s3_storage()
diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py
index 9b20954d45..890b70b9fc 100644
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -36,3 +36,6 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
     with zenbenchmark.record_duration("test_query"):
         cur.execute("SELECT count(*) from t")
         assert cur.fetchone() == (n_iters * n_records,)
+
+    # see https://github.com/neondatabase/neon/issues/8712
+    env.stop(immediate=True)

From 8f170c51059087b26ac098219ca97072f053f513 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 13 Aug 2024 20:00:54 +0300
Subject: [PATCH 25/62] fix: make compaction more sensitive to cancellation
 (#8706)

A few of the benchmarks have started failing after #8655 where they are
waiting for compactor task. Reads done by image layer creation should
already be cancellation sensitive because vectored get does a check each
time, but try sprinkling additional cancellation points to:

- each partition
- after each vectored read batch
---
 pageserver/src/tenant/timeline.rs              | 11 +++++++++++
 pageserver/src/tenant/timeline/compaction.rs   |  8 ++++++++
 test_runner/fixtures/neon_fixtures.py          |  2 ++
 test_runner/regress/test_pageserver_restart.py | 10 ++++++++++
 4 files changed, 31 insertions(+)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index abe3f56e45..767f5969fc 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3917,6 +3917,10 @@ impl Timeline {
                         .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
                         .await?;
 
+                    if self.cancel.is_cancelled() {
+                        return Err(CreateImageLayersError::Cancelled);
+                    }
+
                     for (img_key, img) in results {
                         let img = match img {
                             Ok(img) => img,
@@ -4024,6 +4028,9 @@ impl Timeline {
                 next_start_key: img_range.end,
             });
         }
+        if self.cancel.is_cancelled() {
+            return Err(CreateImageLayersError::Cancelled);
+        }
         let mut wrote_any_image = false;
         for (k, v) in data {
             if v.is_empty() {
@@ -4138,6 +4145,10 @@ impl Timeline {
         let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
 
         for partition in partitioning.parts.iter() {
+            if self.cancel.is_cancelled() {
+                return Err(CreateImageLayersError::Cancelled);
+            }
+
             let img_range = start..partition.ranges.last().unwrap().end;
             let compact_metadata = partition.overlaps(&Key::metadata_key_range());
             if compact_metadata {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 8390cb839c..9ac0086cde 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -748,6 +748,9 @@ impl Timeline {
         let all_keys = {
             let mut all_keys = Vec::new();
             for l in deltas_to_compact.iter() {
+                if self.cancel.is_cancelled() {
+                    return Err(CompactionError::ShuttingDown);
+                }
                 all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
             }
             // The current stdlib sorting implementation is designed in a way where it is
@@ -830,6 +833,11 @@ impl Timeline {
         };
         stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
         drop_rlock(guard);
+
+        if self.cancel.is_cancelled() {
+            return Err(CompactionError::ShuttingDown);
+        }
+
         stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
 
         // This iterator walks through all key-value pairs from all the layers
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 561e8bce04..daa4c8b97f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1251,6 +1251,8 @@ class NeonEnv:
     def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
         """
         After this method returns, there should be no child processes running.
+
+        Unless of course, some stopping failed, in that case, all remaining child processes are leaked.
         """
         self.endpoints.stop_all(fail_on_endpoint_errors)
 
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index 68a45f957c..bbf82fea4c 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -159,6 +159,8 @@ def test_pageserver_chaos(
     if build_type == "debug":
         pytest.skip("times out in debug builds")
 
+    # same rationale as with the immediate stop; we might leave orphan layers behind.
+    neon_env_builder.disable_scrub_on_exit()
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     if shard_count is not None:
         neon_env_builder.num_pageservers = shard_count
@@ -220,3 +222,11 @@ def test_pageserver_chaos(
         # Check that all the updates are visible
         num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
         assert num_updates == i * 100000
+
+    # currently pageserver cannot tolerate the fact that "s3" goes away, and if
+    # we succeeded in a compaction before shutdown, there might be a lot of
+    # uploads pending, certainly more than what we can ingest with MOCK_S3
+    #
+    # so instead, do a fast shutdown for this one test.
+    # See https://github.com/neondatabase/neon/issues/8709
+    env.stop(immediate=True)

From ae6e27274cecaf60b0a1388cb9344bc8987a4d3f Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 13 Aug 2024 20:14:42 +0300
Subject: [PATCH 26/62] refactor(test): unify how we clear shared buffers
 (#8634)

so that we can easily plug in LFC clearing as well.

Private discussion reference:
<https://neondb.slack.com/archives/C033A2WE6BZ/p1722942856987979>
---
 test_runner/fixtures/neon_fixtures.py                | 11 +++++++++++
 test_runner/fixtures/workload.py                     | 12 +++---------
 test_runner/regress/test_hot_standby.py              |  3 ++-
 test_runner/regress/test_oid_overflow.py             |  2 +-
 test_runner/regress/test_read_validation.py          |  2 +-
 test_runner/regress/test_timeline_detach_ancestor.py |  2 +-
 test_runner/regress/test_vm_bits.py                  |  2 +-
 test_runner/regress/test_wal_acceptor.py             |  2 +-
 8 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index daa4c8b97f..6600b44759 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4100,6 +4100,17 @@ class Endpoint(PgProtocol, LogUtils):
         assert self.pgdata_dir is not None  # please mypy
         return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
 
+    def clear_shared_buffers(self, cursor: Optional[Any] = None):
+        """
+        Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.'
+
+        Might also clear LFC.
+        """
+        if cursor is not None:
+            cursor.execute("select clear_buffer_cache()")
+        else:
+            self.safe_psql("select clear_buffer_cache()")
+
 
 class EndpointFactory:
     """An object representing multiple compute endpoints."""
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index dfd9caba3e..cc93762175 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -182,14 +182,8 @@ class Workload:
 
     def validate(self, pageserver_id: Optional[int] = None):
         endpoint = self.endpoint(pageserver_id)
-        result = endpoint.safe_psql_many(
-            [
-                "select clear_buffer_cache()",
-                f"""
-            SELECT COUNT(*) FROM {self.table}
-            """,
-            ]
-        )
+        endpoint.clear_shared_buffers()
+        result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}")
 
         log.info(f"validate({self.expect_rows}): {result}")
-        assert result == [[("",)], [(self.expect_rows,)]]
+        assert result == [(self.expect_rows,)]
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 8edc8c554c..ae63136abb 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -168,7 +168,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
             # re-execute the query, it will make GetPage
             # requests. This does not clear the last-written LSN cache
             # so we still remember the LSNs of the pages.
-            s_cur.execute("SELECT clear_buffer_cache()")
+            secondary.clear_shared_buffers(cursor=s_cur)
 
             if pause_apply:
                 s_cur.execute("SELECT pg_wal_replay_pause()")
@@ -332,6 +332,7 @@ def test_replica_query_race(neon_simple_env: NeonEnv):
                 log.info(f"read {reads}: counter {readcounter}, last update {writecounter}")
             reads += 1
 
+            # FIXME: what about LFC clearing?
             await conn.execute("SELECT clear_buffer_cache()")
 
     async def both():
diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py
index a94ae99ed9..e8eefc2414 100644
--- a/test_runner/regress/test_oid_overflow.py
+++ b/test_runner/regress/test_oid_overflow.py
@@ -37,7 +37,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder):
     oid = cur.fetchall()[0][0]
     log.info(f"t2.relfilenode={oid}")
 
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers(cursor=cur)
 
     cur.execute("SELECT x from t1")
     assert cur.fetchone() == (1,)
diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py
index 2437c8f806..d128c60a99 100644
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -61,7 +61,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
 
             log.info("Clear buffer cache to ensure no stale pages are brought into the cache")
 
-            c.execute("select clear_buffer_cache()")
+            endpoint.clear_shared_buffers(cursor=c)
 
             cache_entries = query_scalar(
                 c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}"
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index b3767a2766..4e409eeb17 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -411,7 +411,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn
 
     assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None
 
-    assert ep.safe_psql("SELECT clear_buffer_cache();")
+    ep.clear_shared_buffers()
     assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
     assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0
     ep.stop()
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 225b952e73..7272979c4a 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -62,7 +62,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
 
     # Clear the buffer cache, to force the VM page to be re-fetched from
     # the page server
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers(cursor=cur)
 
     # Check that an index-only scan doesn't see the deleted row. If the
     # clearing of the VM bit was not replayed correctly, this would incorrectly
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index f02f19c588..bf7829fc84 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2159,7 +2159,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
         # generate some data to commit WAL on safekeepers
         endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
         # clear the buffers
-        endpoint.safe_psql("select clear_buffer_cache()")
+        endpoint.clear_shared_buffers()
         # read data to fetch pages from pageserver
         endpoint.safe_psql("select sum(i) from t")
 

From 9d2276323d8f375c7394a92f71fd8c728d61ab96 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Tue, 13 Aug 2024 19:36:39 +0200
Subject: [PATCH 27/62] Benchmarking tests: automatically restore Neon reuse
 databases, too and migrate to pg16 (#8707)

## Problem

We use a set of **Neon** reuse databases in benchmarking.yml which are
still using pg14.
Because we want to compare apples to apples and have migrated the AWS
reuse clusters to pg16 we should also use pg16 for Neon.

## Summary of changes

- Automatically restore the test databases for Neon project
---
 .github/workflows/_benchmarking_preparation.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml
index 0f540afab7..7229776cd6 100644
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -13,7 +13,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres ] 
+        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] 
         database: [ clickbench, tpch, userexample ]
   
     env:
@@ -31,6 +31,9 @@ jobs:
       id: set-up-prep-connstr
       run: |
         case "${PLATFORM}" in
+          neon)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} 
+            ;;
           aws-rds-postgres)
             CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} 
             ;;

From 87a5d7db9e0f47eab8b48cebc6ab47045a8c9b1b Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 13 Aug 2024 20:49:50 +0300
Subject: [PATCH 28/62] test: do better job of shutting everything down (#8714)

After #8655 we've had a few issues (mostly tracked on #8708) with the
graceful shutdown. In order to shutdown more of the processes and catch
more errors, for example, from all pageservers, do an immediate shutdown
for those nodes which fail the initial (possibly graceful) shutdown.

Cc: #6485
---
 test_runner/fixtures/neon_fixtures.py | 40 +++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 6600b44759..961dbde95c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1254,20 +1254,54 @@ class NeonEnv:
 
         Unless of course, some stopping failed, in that case, all remaining child processes are leaked.
         """
-        self.endpoints.stop_all(fail_on_endpoint_errors)
+
+        # the commonly failing components have special try-except behavior,
+        # trying to get us to actually shutdown all processes over easier error
+        # reporting.
+
+        raise_later = None
+        try:
+            self.endpoints.stop_all(fail_on_endpoint_errors)
+        except Exception as e:
+            raise_later = e
 
         # Stop storage controller before pageservers: we don't want it to spuriously
         # detect a pageserver "failure" during test teardown
         self.storage_controller.stop(immediate=immediate)
 
+        stop_later = []
+        metric_errors = []
+
         for sk in self.safekeepers:
             sk.stop(immediate=immediate)
         for pageserver in self.pageservers:
             if ps_assert_metric_no_errors:
-                pageserver.assert_no_metric_errors()
-            pageserver.stop(immediate=immediate)
+                try:
+                    pageserver.assert_no_metric_errors()
+                except Exception as e:
+                    metric_errors.append(e)
+                    log.error(f"metric validation failed on {pageserver.id}: {e}")
+            try:
+                pageserver.stop(immediate=immediate)
+            except RuntimeError:
+                stop_later.append(pageserver)
         self.broker.stop(immediate=immediate)
 
+        # TODO: for nice logging we need python 3.11 ExceptionGroup
+        for ps in stop_later:
+            ps.stop(immediate=True)
+
+        if raise_later is not None:
+            raise raise_later
+
+        for error in metric_errors:
+            raise error
+
+        if len(stop_later) > 0:
+            raise RuntimeError(
+                f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully"
+            )
+
     @property
     def pageserver(self) -> NeonPageserver:
         """

From 6d6e2c6a395d365852df478b37bbf50af1e48e6b Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 13 Aug 2024 20:51:51 +0300
Subject: [PATCH 29/62] feat(detach_ancestor): better retries with persistent
 gc blocking (#8430)

With the persistent gc blocking, we can now retry reparenting timelines
which had failed for whatever reason on the previous attempt(s).
Restructure the detach_ancestor into three phases:

- prepare (insert persistent gc blocking, copy lsn prefix, layers)
- detach and reparent
    - reparenting can fail, so we might need to retry this portion
- complete (remove persistent gc blocking)

Cc: #6994
---
 libs/utils/src/completion.rs                  |  31 +-
 pageserver/src/http/routes.rs                 |   4 +-
 pageserver/src/tenant.rs                      |  12 +-
 pageserver/src/tenant/metadata.rs             |   3 +
 pageserver/src/tenant/mgr.rs                  | 119 +++-
 .../src/tenant/remote_timeline_client.rs      |  70 ++-
 .../tenant/remote_timeline_client/index.rs    |  57 +-
 pageserver/src/tenant/tasks.rs                |  14 +-
 pageserver/src/tenant/timeline.rs             |  30 +-
 .../src/tenant/timeline/detach_ancestor.rs    | 528 ++++++++++++++----
 storage_controller/src/service.rs             |   4 +-
 .../regress/test_timeline_detach_ancestor.py  | 327 ++++++++++-
 12 files changed, 960 insertions(+), 239 deletions(-)

diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs
index 2fef8d35df..f65c080ad4 100644
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -5,13 +5,40 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
 pub struct Completion {
-    _token: TaskTrackerToken,
+    token: TaskTrackerToken,
+}
+
+impl std::fmt::Debug for Completion {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Completion")
+            .field("siblings", &self.token.task_tracker().len())
+            .finish()
+    }
+}
+
+impl Completion {
+    /// Returns true if this completion is associated with the given barrier.
+    pub fn blocks(&self, barrier: &Barrier) -> bool {
+        TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0)
+    }
+
+    pub fn barrier(&self) -> Barrier {
+        Barrier(self.token.task_tracker().clone())
+    }
 }
 
 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
 pub struct Barrier(TaskTracker);
 
+impl std::fmt::Debug for Barrier {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Barrier")
+            .field("remaining", &self.0.len())
+            .finish()
+    }
+}
+
 impl Default for Barrier {
     fn default() -> Self {
         let (_, rx) = channel();
@@ -51,5 +78,5 @@ pub fn channel() -> (Completion, Barrier) {
     tracker.close();
 
     let token = tracker.token();
-    (Completion { _token: token }, Barrier(tracker))
+    (Completion { token }, Barrier(tracker))
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 2b0156079e..6f7480cc6c 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1887,7 +1887,7 @@ async fn timeline_detach_ancestor_handler(
         // drop(tenant);
 
         let resp = match progress {
-            detach_ancestor::Progress::Prepared(_guard, prepared) => {
+            detach_ancestor::Progress::Prepared(attempt, prepared) => {
                 // it would be great to tag the guard on to the tenant activation future
                 let reparented_timelines = state
                     .tenant_manager
@@ -1895,10 +1895,10 @@ async fn timeline_detach_ancestor_handler(
                         tenant_shard_id,
                         timeline_id,
                         prepared,
+                        attempt,
                         ctx,
                     )
                     .await
-                    .context("timeline detach ancestor completion")
                     .map_err(ApiError::InternalServerError)?;
 
                 AncestorDetached {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index a238004aad..b065f58382 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -302,7 +302,11 @@ pub struct Tenant {
     pub(crate) timeline_get_throttle:
         Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
 
-    /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
+    /// An ongoing timeline detach concurrency limiter.
+    ///
+    /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense
+    /// to have two running at the same time. A different one can be started if an earlier one
+    /// has failed for whatever reason.
     ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
 
     /// `index_part.json` based gc blocking reason tracking.
@@ -833,9 +837,9 @@ impl Tenant {
                             // The Stopping case is for when we have passed control on to DeleteTenantFlow:
                             // if it errors, we will call make_broken when tenant is already in Stopping.
                             assert!(
-                            matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
-                            "the attach task owns the tenant state until activation is complete"
-                        );
+                                matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
+                                "the attach task owns the tenant state until activation is complete"
+                            );
 
                             *state = TenantState::broken_from_reason(err.to_string());
                         });
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index bbc070a81b..6073abc8c3 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -285,12 +285,15 @@ impl TimelineMetadata {
     }
 
     /// When reparenting, the `ancestor_lsn` does not change.
+    ///
+    /// Returns true if anything was changed.
     pub fn reparent(&mut self, timeline: &TimelineId) {
         assert!(self.body.ancestor_timeline.is_some());
         // no assertion for redoing this: it's fine, we may have to repeat this multiple times over
         self.body.ancestor_timeline = Some(*timeline);
     }
 
+    /// Returns true if anything was changed
     pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
         if let Some(ancestor) = self.body.ancestor_timeline {
             assert_eq!(ancestor, branchpoint.0);
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index c8a11e88cc..5f2539d426 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -54,7 +54,7 @@ use utils::id::{TenantId, TimelineId};
 
 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
-use super::timeline::detach_ancestor::PreparedTimelineDetach;
+use super::timeline::detach_ancestor::{self, PreparedTimelineDetach};
 use super::{GlobalShutDown, TenantSharedResources};
 
 /// For a tenant that appears in TenantsMap, it may either be
@@ -1927,8 +1927,10 @@ impl TenantManager {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         prepared: PreparedTimelineDetach,
+        mut attempt: detach_ancestor::Attempt,
         ctx: &RequestContext,
     ) -> Result<HashSet<TimelineId>, anyhow::Error> {
+        use crate::tenant::timeline::detach_ancestor::Error;
         // FIXME: this is unnecessary, slotguard already has these semantics
         struct RevertOnDropSlot(Option<SlotGuard>);
 
@@ -1977,43 +1979,98 @@ impl TenantManager {
 
         let timeline = tenant.get_timeline(timeline_id, true)?;
 
-        let reparented = timeline
-            .complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
+        let resp = timeline
+            .detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
             .await?;
 
         let mut slot_guard = slot_guard.into_inner();
 
-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Hard).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
+        let tenant = if resp.reset_tenant_required() {
+            attempt.before_reset_tenant();
+
+            let (_guard, progress) = utils::completion::channel();
+            match tenant.shutdown(progress, ShutdownMode::Hard).await {
+                Ok(()) => {
+                    slot_guard.drop_old_value()?;
+                }
+                Err(_barrier) => {
+                    slot_guard.revert();
+                    // this really should not happen, at all, unless shutdown was already going?
+                    anyhow::bail!("Cannot restart Tenant, already shutting down");
+                }
             }
-            Err(_barrier) => {
-                slot_guard.revert();
-                // this really should not happen, at all, unless shutdown was already going?
-                anyhow::bail!("Cannot restart Tenant, already shutting down");
+
+            let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+            let shard_identity = config.shard;
+            let tenant = tenant_spawn(
+                self.conf,
+                tenant_shard_id,
+                &tenant_path,
+                self.resources.clone(),
+                AttachedTenantConf::try_from(config)?,
+                shard_identity,
+                None,
+                SpawnMode::Eager,
+                ctx,
+            )?;
+
+            {
+                let mut g = tenant.ongoing_timeline_detach.lock().unwrap();
+                assert!(
+                    g.is_none(),
+                    "there cannot be any new timeline detach ancestor on newly created tenant"
+                );
+                *g = Some((attempt.timeline_id, attempt.new_barrier()));
             }
+
+            slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
+            tenant
+        } else {
+            tracing::info!("skipping tenant_reset as no changes made required it");
+            tenant
+        };
+
+        if let Some(reparented) = resp.completed() {
+            // finally ask the restarted tenant to complete the detach
+            //
+            // rationale for 9999s: we don't really have a timetable here; if retried, the caller
+            // will get an 503.
+            tenant
+                .wait_to_become_active(std::time::Duration::from_secs(9999))
+                .await
+                .map_err(|e| {
+                    use pageserver_api::models::TenantState;
+                    use GetActiveTenantError::{Cancelled, WillNotBecomeActive};
+                    match e {
+                        Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => {
+                            Error::ShuttingDown
+                        }
+                        other => Error::Unexpected(other.into()),
+                    }
+                })?;
+
+            utils::pausable_failpoint!(
+                "timeline-detach-ancestor::after_activating_before_finding-pausable"
+            );
+
+            let timeline = tenant
+                .get_timeline(attempt.timeline_id, true)
+                .map_err(|_| Error::DetachedNotFoundAfterRestart)?;
+
+            timeline
+                .complete_detaching_timeline_ancestor(&tenant, attempt, ctx)
+                .await
+                .map(|()| reparented)
+                .map_err(|e| e.into())
+        } else {
+            // at least the latest versions have now been downloaded and refreshed; be ready to
+            // retry another time.
+            Err(anyhow::anyhow!(
+                "failed to reparent all candidate timelines, please retry"
+            ))
         }
-
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            SpawnMode::Eager,
-            ctx,
-        )?;
-
-        slot_guard.upsert(TenantSlot::Attached(tenant))?;
-
-        Ok(reparented)
     }
 
     /// A page service client sends a TenantId, and to look up the correct Tenant we must
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 8a76d7532f..b4d7ad1e97 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -736,12 +736,13 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Reparent this timeline to a new parent.
+    ///
+    /// A retryable step of timeline ancestor detach.
     pub(crate) async fn schedule_reparenting_and_wait(
         self: &Arc<Self>,
         new_parent: &TimelineId,
     ) -> anyhow::Result<()> {
-        // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
-        // and reads the in-memory part we cannot do the detaching like this
         let receiver = {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
@@ -752,17 +753,25 @@ impl RemoteTimelineClient {
                 ));
             };
 
-            upload_queue.dirty.metadata.reparent(new_parent);
-            upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+            let uploaded = &upload_queue.clean.0.metadata;
 
-            self.schedule_index_upload(upload_queue)?;
+            if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() {
+                // nothing to do
+                None
+            } else {
+                upload_queue.dirty.metadata.reparent(new_parent);
+                upload_queue.dirty.lineage.record_previous_ancestor(&prev);
 
-            self.schedule_barrier0(upload_queue)
+                self.schedule_index_upload(upload_queue)?;
+
+                Some(self.schedule_barrier0(upload_queue))
+            }
         };
 
-        Self::wait_completion0(receiver)
-            .await
-            .context("wait completion")
+        if let Some(receiver) = receiver {
+            Self::wait_completion0(receiver).await?;
+        }
+        Ok(())
     }
 
     /// Schedules uploading a new version of `index_part.json` with the given layers added,
@@ -778,26 +787,30 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
 
-            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
-            upload_queue.dirty.lineage.record_detaching(&adopted);
+            if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) {
+                None
+            } else {
+                upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
+                upload_queue.dirty.lineage.record_detaching(&adopted);
 
-            for layer in layers {
-                upload_queue
-                    .dirty
-                    .layer_metadata
-                    .insert(layer.layer_desc().layer_name(), layer.metadata());
+                for layer in layers {
+                    let prev = upload_queue
+                        .dirty
+                        .layer_metadata
+                        .insert(layer.layer_desc().layer_name(), layer.metadata());
+                    assert!(prev.is_none(), "copied layer existed already {layer}");
+                }
+
+                self.schedule_index_upload(upload_queue)?;
+
+                Some(self.schedule_barrier0(upload_queue))
             }
-
-            self.schedule_index_upload(upload_queue)?;
-
-            let barrier = self.schedule_barrier0(upload_queue);
-            self.launch_queued_tasks(upload_queue);
-            barrier
         };
 
-        Self::wait_completion0(barrier)
-            .await
-            .context("wait completion")
+        if let Some(barrier) = barrier {
+            Self::wait_completion0(barrier).await?;
+        }
+        Ok(())
     }
 
     /// Adds a gc blocking reason for this timeline if one does not exist already.
@@ -873,12 +886,7 @@ impl RemoteTimelineClient {
             let upload_queue = guard.initialized_mut()?;
 
             if let index::GcBlockingReason::DetachAncestor = reason {
-                if !upload_queue
-                    .clean
-                    .0
-                    .lineage
-                    .is_detached_from_original_ancestor()
-                {
+                if !upload_queue.clean.0.lineage.is_detached_from_ancestor() {
                     drop(guard);
                     panic!("cannot complete timeline_ancestor_detach while not detached");
                 }
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 90453b1922..757fb9d032 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -216,26 +216,47 @@ fn is_false(b: &bool) -> bool {
 impl Lineage {
     const REMEMBER_AT_MOST: usize = 100;
 
-    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
+    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool {
         if self.reparenting_history.last() == Some(old_ancestor) {
             // do not re-record it
-            return;
-        }
+            false
+        } else {
+            #[cfg(feature = "testing")]
+            {
+                let existing = self
+                    .reparenting_history
+                    .iter()
+                    .position(|x| x == old_ancestor);
+                assert_eq!(
+                    existing, None,
+                    "we cannot reparent onto and off and onto the same timeline twice"
+                );
+            }
+            let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
 
-        let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
-
-        self.reparenting_history_truncated |= drop_oldest;
-        if drop_oldest {
-            self.reparenting_history.remove(0);
+            self.reparenting_history_truncated |= drop_oldest;
+            if drop_oldest {
+                self.reparenting_history.remove(0);
+            }
+            self.reparenting_history.push(*old_ancestor);
+            true
         }
-        self.reparenting_history.push(*old_ancestor);
     }
 
-    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
-        assert!(self.original_ancestor.is_none());
-
-        self.original_ancestor =
-            Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+    /// Returns true if anything changed.
+    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
+        if let Some((id, lsn, _)) = self.original_ancestor {
+            assert_eq!(
+                &(id, lsn),
+                branchpoint,
+                "detaching attempt has to be for the same ancestor we are already detached from"
+            );
+            false
+        } else {
+            self.original_ancestor =
+                Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+            true
+        }
     }
 
     /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
@@ -247,10 +268,16 @@ impl Lineage {
             .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
     }
 
-    pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
+    /// Returns true if the timeline originally had an ancestor, and no longer has one.
+    pub(crate) fn is_detached_from_ancestor(&self) -> bool {
         self.original_ancestor.is_some()
     }
 
+    /// Returns original ancestor timeline id and lsn that this timeline has been detached from.
+    pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> {
+        self.original_ancestor.map(|(id, lsn, _)| (id, lsn))
+    }
+
     pub(crate) fn is_reparented(&self) -> bool {
         !self.reparenting_history.is_empty()
     }
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 713845e9ac..dbcd704b4e 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -366,14 +366,13 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             if first {
                 first = false;
 
-                if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
-                    .await
-                    .is_err()
-                {
-                    break;
-                }
+                let delays = async {
+                    delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
+                    random_init_delay(period, &cancel).await?;
+                    Ok::<_, Cancelled>(())
+                };
 
-                if random_init_delay(period, &cancel).await.is_err() {
+                if delays.await.is_err() {
                     break;
                 }
             }
@@ -425,7 +424,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 
             warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
 
-            // Sleep
             if tokio::time::timeout(sleep_duration, cancel.cancelled())
                 .await
                 .is_ok()
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 767f5969fc..f1587951c6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4328,18 +4328,34 @@ impl Timeline {
         detach_ancestor::prepare(self, tenant, options, ctx).await
     }
 
-    /// Completes the ancestor detach. This method is to be called while holding the
-    /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any
-    /// timeline be deleted. After this method returns successfully, tenant must be reloaded.
+    /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and
+    /// reparents any reparentable children of previous ancestor.
     ///
-    /// Pageserver receiving a SIGKILL during this operation is not supported (yet).
-    pub(crate) async fn complete_detaching_timeline_ancestor(
+    /// This method is to be called while holding the TenantManager's tenant slot, so during this
+    /// method we cannot be deleted nor can any timeline be deleted. After this method returns
+    /// successfully, tenant must be reloaded.
+    ///
+    /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally
+    /// resetting the tenant.
+    pub(crate) async fn detach_from_ancestor_and_reparent(
         self: &Arc<Timeline>,
         tenant: &crate::tenant::Tenant,
         prepared: detach_ancestor::PreparedTimelineDetach,
         ctx: &RequestContext,
-    ) -> Result<HashSet<TimelineId>, anyhow::Error> {
-        detach_ancestor::complete(self, tenant, prepared, ctx).await
+    ) -> Result<detach_ancestor::DetachingAndReparenting, anyhow::Error> {
+        detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await
+    }
+
+    /// Final step which unblocks the GC.
+    ///
+    /// The tenant must've been reset if ancestry was modified previously (in tenant manager).
+    pub(crate) async fn complete_detaching_timeline_ancestor(
+        self: &Arc<Timeline>,
+        tenant: &crate::tenant::Tenant,
+        attempt: detach_ancestor::Attempt,
+        ctx: &RequestContext,
+    ) -> Result<(), detach_ancestor::Error> {
+        detach_ancestor::complete(self, tenant, attempt, ctx).await
     }
 
     /// Switch aux file policy and schedule upload to the index part.
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 3b52adc77b..969da2662b 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -5,12 +5,16 @@ use crate::{
     context::{DownloadBehavior, RequestContext},
     task_mgr::TaskKind,
     tenant::{
+        mgr::GetActiveTenantError,
+        remote_timeline_client::index::GcBlockingReason::DetachAncestor,
         storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
         Tenant,
     },
     virtual_file::{MaybeFatalIo, VirtualFile},
 };
+use anyhow::Context;
 use pageserver_api::models::detach_ancestor::AncestorDetached;
+use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
 use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
@@ -38,6 +42,12 @@ pub(crate) enum Error {
     #[error("remote copying layer failed")]
     CopyFailed(#[source] anyhow::Error),
 
+    #[error("wait for tenant to activate after restarting")]
+    WaitToActivate(#[source] GetActiveTenantError),
+
+    #[error("detached timeline was not found after restart")]
+    DetachedNotFoundAfterRestart,
+
     #[error("unexpected error")]
     Unexpected(#[source] anyhow::Error),
 
@@ -55,6 +65,10 @@ impl From<Error> for ApiError {
             Error::OtherTimelineDetachOngoing(_) => {
                 ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
             }
+            e @ Error::WaitToActivate(_) => {
+                let s = utils::error::report_compact_sources(&e).to_string();
+                ApiError::ResourceUnavailable(s.into())
+            }
             // All of these contain shutdown errors, in fact, it's the most common
             e @ Error::FlushAncestor(_)
             | e @ Error::RewrittenDeltaDownloadFailed(_)
@@ -63,6 +77,7 @@ impl From<Error> for ApiError {
             | e @ Error::CopyFailed(_)
             | e @ Error::Unexpected(_)
             | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
+            Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()),
         }
     }
 }
@@ -96,8 +111,25 @@ impl From<FlushLayerError> for Error {
     }
 }
 
+impl From<GetActiveTenantError> for Error {
+    fn from(value: GetActiveTenantError) -> Self {
+        use pageserver_api::models::TenantState;
+        use GetActiveTenantError::*;
+
+        match value {
+            Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => {
+                Error::ShuttingDown
+            }
+            WaitForActiveTimeout { .. } | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => {
+                // NotFound seems out-of-place
+                Error::WaitToActivate(value)
+            }
+        }
+    }
+}
+
 pub(crate) enum Progress {
-    Prepared(completion::Completion, PreparedTimelineDetach),
+    Prepared(Attempt, PreparedTimelineDetach),
     Done(AncestorDetached),
 }
 
@@ -121,6 +153,26 @@ impl Default for Options {
     }
 }
 
+/// Represents an across tenant reset exclusive single attempt to detach ancestor.
+#[derive(Debug)]
+pub(crate) struct Attempt {
+    pub(crate) timeline_id: TimelineId,
+
+    _guard: completion::Completion,
+    gate_entered: Option<utils::sync::gate::GateGuard>,
+}
+
+impl Attempt {
+    pub(crate) fn before_reset_tenant(&mut self) {
+        let taken = self.gate_entered.take();
+        assert!(taken.is_some());
+    }
+
+    pub(crate) fn new_barrier(&self) -> completion::Barrier {
+        self._guard.barrier()
+    }
+}
+
 /// See [`Timeline::prepare_to_detach_from_ancestor`]
 pub(super) async fn prepare(
     detached: &Arc<Timeline>,
@@ -135,15 +187,33 @@ pub(super) async fn prepare(
         .as_ref()
         .map(|tl| (tl.clone(), detached.ancestor_lsn))
     else {
-        {
+        let still_in_progress = {
             let accessor = detached.remote_client.initialized_upload_queue()?;
 
             // we are safe to inspect the latest uploaded, because we can only witness this after
             // restart is complete and ancestor is no more.
             let latest = accessor.latest_uploaded_index_part();
-            if !latest.lineage.is_detached_from_original_ancestor() {
+            if latest.lineage.detached_previous_ancestor().is_none() {
                 return Err(NoAncestor);
-            }
+            };
+
+            latest
+                .gc_blocking
+                .as_ref()
+                .is_some_and(|b| b.blocked_by(DetachAncestor))
+        };
+
+        if still_in_progress {
+            // gc is still blocked, we can still reparent and complete.
+            // we are safe to reparent remaining, because they were locked in in the beginning.
+            let attempt = continue_with_blocked_gc(detached, tenant).await?;
+
+            // because the ancestor of detached is already set to none, we have published all
+            // of the layers, so we are still "prepared."
+            return Ok(Progress::Prepared(
+                attempt,
+                PreparedTimelineDetach { layers: Vec::new() },
+            ));
         }
 
         let reparented_timelines = reparented_direct_children(detached, tenant)?;
@@ -164,22 +234,7 @@ pub(super) async fn prepare(
         return Err(TooManyAncestors);
     }
 
-    // before we acquire the gate, we must mark the ancestor as having a detach operation
-    // ongoing which will block other concurrent detach operations so we don't get to ackward
-    // situations where there would be two branches trying to reparent earlier branches.
-    let (guard, barrier) = completion::channel();
-
-    {
-        let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
-        if let Some((tl, other)) = guard.as_ref() {
-            if !other.is_ready() {
-                return Err(OtherTimelineDetachOngoing(*tl));
-            }
-        }
-        *guard = Some((detached.timeline_id, barrier));
-    }
-
-    let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
+    let attempt = start_new_attempt(detached, tenant).await?;
 
     utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
 
@@ -245,7 +300,8 @@ pub(super) async fn prepare(
     };
 
     // TODO: layers are already sorted by something: use that to determine how much of remote
-    // copies are already done.
+    // copies are already done -- gc is blocked, but a compaction could had happened on ancestor,
+    // which is something to keep in mind if copy skipping is implemented.
     tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers");
 
     // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
@@ -259,29 +315,33 @@ pub(super) async fn prepare(
 
         let mut wrote_any = false;
 
-        let limiter = Arc::new(tokio::sync::Semaphore::new(
-            options.rewrite_concurrency.get(),
-        ));
+        let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get()));
 
         for layer in straddling_branchpoint {
             let limiter = limiter.clone();
             let timeline = detached.clone();
             let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);
 
-            tasks.spawn(async move {
-                let _permit = limiter.acquire().await;
-                let copied =
-                    upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
-                        .await?;
-                Ok(copied)
-            });
+            let span = tracing::info_span!("upload_rewritten_layer", %layer);
+            tasks.spawn(
+                async move {
+                    let _permit = limiter.acquire().await;
+                    let copied =
+                        upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
+                            .await?;
+                    if let Some(copied) = copied.as_ref() {
+                        tracing::info!(%copied, "rewrote and uploaded");
+                    }
+                    Ok(copied)
+                }
+                .instrument(span),
+            );
         }
 
         while let Some(res) = tasks.join_next().await {
             match res {
                 Ok(Ok(Some(copied))) => {
                     wrote_any = true;
-                    tracing::info!(layer=%copied, "rewrote and uploaded");
                     new_layers.push(copied);
                 }
                 Ok(Ok(None)) => {}
@@ -308,7 +368,7 @@ pub(super) async fn prepare(
     }
 
     let mut tasks = tokio::task::JoinSet::new();
-    let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get()));
+    let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get()));
 
     for adopted in rest_of_historic {
         let limiter = limiter.clone();
@@ -342,7 +402,56 @@ pub(super) async fn prepare(
 
     let prepared = PreparedTimelineDetach { layers: new_layers };
 
-    Ok(Progress::Prepared(guard, prepared))
+    Ok(Progress::Prepared(attempt, prepared))
+}
+
+async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
+    let attempt = obtain_exclusive_attempt(detached, tenant)?;
+
+    // insert the block in the index_part.json, if not already there.
+    let _dont_care = tenant
+        .gc_block
+        .insert(
+            detached,
+            crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
+        )
+        .await
+        // FIXME: better error
+        .map_err(Error::Unexpected)?;
+
+    Ok(attempt)
+}
+
+async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
+    // FIXME: it would be nice to confirm that there is an in-memory version, since we've just
+    // verified there is a persistent one?
+    obtain_exclusive_attempt(detached, tenant)
+}
+
+fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
+    use Error::{OtherTimelineDetachOngoing, ShuttingDown};
+
+    // ensure we are the only active attempt for this tenant
+    let (guard, barrier) = completion::channel();
+    {
+        let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
+        if let Some((tl, other)) = guard.as_ref() {
+            if !other.is_ready() {
+                return Err(OtherTimelineDetachOngoing(*tl));
+            }
+            // FIXME: no test enters here
+        }
+        *guard = Some((detached.timeline_id, barrier));
+    }
+
+    // ensure the gate is still open
+    let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
+
+    Ok(Attempt {
+        timeline_id: detached.timeline_id,
+        _guard: guard,
+        gate_entered: Some(_gate_entered),
+    })
 }
 
 fn reparented_direct_children(
@@ -548,96 +657,207 @@ async fn remote_copy(
         .map_err(CopyFailed)
 }
 
-/// See [`Timeline::complete_detaching_timeline_ancestor`].
-pub(super) async fn complete(
+pub(crate) enum DetachingAndReparenting {
+    /// All of the following timeline ids were reparented and the timeline ancestor detach must be
+    /// marked as completed.
+    Reparented(HashSet<TimelineId>),
+
+    /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as
+    /// completed.
+    ///
+    /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made.
+    SomeReparentingFailed { must_reset_tenant: bool },
+
+    /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach
+    /// must be marked as completed.
+    AlreadyDone(HashSet<TimelineId>),
+}
+
+impl DetachingAndReparenting {
+    pub(crate) fn reset_tenant_required(&self) -> bool {
+        use DetachingAndReparenting::*;
+        match self {
+            Reparented(_) => true,
+            SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant,
+            AlreadyDone(_) => false,
+        }
+    }
+
+    pub(crate) fn completed(self) -> Option<HashSet<TimelineId>> {
+        use DetachingAndReparenting::*;
+        match self {
+            Reparented(x) | AlreadyDone(x) => Some(x),
+            SomeReparentingFailed { .. } => None,
+        }
+    }
+}
+
+/// See [`Timeline::detach_from_ancestor_and_reparent`].
+pub(super) async fn detach_and_reparent(
     detached: &Arc<Timeline>,
     tenant: &Tenant,
     prepared: PreparedTimelineDetach,
     _ctx: &RequestContext,
-) -> Result<HashSet<TimelineId>, anyhow::Error> {
+) -> Result<DetachingAndReparenting, anyhow::Error> {
     let PreparedTimelineDetach { layers } = prepared;
 
-    let ancestor = detached
-        .ancestor_timeline
-        .as_ref()
-        .expect("must still have a ancestor");
-    let ancestor_lsn = detached.get_ancestor_lsn();
+    #[derive(Debug)]
+    enum Ancestor {
+        NotDetached(Arc<Timeline>, Lsn),
+        Detached(Arc<Timeline>, Lsn),
+    }
+
+    let (recorded_branchpoint, still_ongoing) = {
+        let access = detached.remote_client.initialized_upload_queue()?;
+        let latest = access.latest_uploaded_index_part();
+
+        (
+            latest.lineage.detached_previous_ancestor(),
+            latest
+                .gc_blocking
+                .as_ref()
+                .is_some_and(|b| b.blocked_by(DetachAncestor)),
+        )
+    };
+    assert!(
+        still_ongoing,
+        "cannot (detach? reparent)? complete if the operation is not still ongoing"
+    );
+
+    let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) {
+        (Some(ancestor), None) => {
+            assert!(
+                !layers.is_empty(),
+                "there should always be at least one layer to inherit"
+            );
+            Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn)
+        }
+        (Some(_), Some(_)) => {
+            panic!(
+                "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None"
+            );
+        }
+        (None, Some((ancestor_id, ancestor_lsn))) => {
+            // it has been either:
+            // - detached but still exists => we can try reparenting
+            // - detached and deleted
+            //
+            // either way, we must complete
+            assert!(
+                layers.is_empty(),
+                "no layers should had been copied as detach is done"
+            );
+
+            let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned();
+
+            if let Some(ancestor) = existing {
+                Ancestor::Detached(ancestor, ancestor_lsn)
+            } else {
+                let direct_children = reparented_direct_children(detached, tenant)?;
+                return Ok(DetachingAndReparenting::AlreadyDone(direct_children));
+            }
+        }
+        (None, None) => {
+            // TODO: make sure there are no `?` before tenant_reset from after a questionmark from
+            // here.
+            panic!(
+            "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor"
+            );
+        }
+    };
 
     // publish the prepared layers before we reparent any of the timelines, so that on restart
     // reparented timelines find layers. also do the actual detaching.
     //
-    // if we crash after this operation, we will at least come up having detached a timeline, but
-    // we cannot go back and reparent the timelines which would had been reparented in normal
-    // execution.
-    //
-    // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
-    // which could give us a completely wrong layer combination.
-    detached
-        .remote_client
-        .schedule_adding_existing_layers_to_index_detach_and_wait(
-            &layers,
-            (ancestor.timeline_id, ancestor_lsn),
-        )
-        .await?;
+    // if we crash after this operation, a retry will allow reparenting the remaining timelines as
+    // gc is blocked.
+
+    let (ancestor, ancestor_lsn, was_detached) = match ancestor {
+        Ancestor::NotDetached(ancestor, ancestor_lsn) => {
+            // this has to complete before any reparentings because otherwise they would not have
+            // layers on the new parent.
+            detached
+                .remote_client
+                .schedule_adding_existing_layers_to_index_detach_and_wait(
+                    &layers,
+                    (ancestor.timeline_id, ancestor_lsn),
+                )
+                .await
+                .context("publish layers and detach ancestor")?;
+
+            tracing::info!(
+                ancestor=%ancestor.timeline_id,
+                %ancestor_lsn,
+                inherited_layers=%layers.len(),
+                "detached from ancestor"
+            );
+            (ancestor, ancestor_lsn, true)
+        }
+        Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false),
+    };
 
     let mut tasks = tokio::task::JoinSet::new();
 
+    // Returns a single permit semaphore which will be used to make one reparenting succeed,
+    // others will fail as if those timelines had been stopped for whatever reason.
+    #[cfg(feature = "testing")]
+    let failpoint_sem = || -> Option<Arc<Semaphore>> {
+        fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some(
+            Arc::new(Semaphore::new(1))
+        ));
+        None
+    }();
+
     // because we are now keeping the slot in progress, it is unlikely that there will be any
     // timeline deletions during this time. if we raced one, then we'll just ignore it.
-    tenant
-        .timelines
-        .lock()
-        .unwrap()
-        .values()
-        .filter_map(|tl| {
-            if Arc::ptr_eq(tl, detached) {
-                return None;
-            }
+    {
+        let g = tenant.timelines.lock().unwrap();
+        reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn)
+            .cloned()
+            .for_each(|timeline| {
+                // important in this scope: we are holding the Tenant::timelines lock
+                let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
+                let new_parent = detached.timeline_id;
+                #[cfg(feature = "testing")]
+                let failpoint_sem = failpoint_sem.clone();
 
-            if !tl.is_active() {
-                return None;
-            }
+                tasks.spawn(
+                    async move {
+                        let res = async {
+                            #[cfg(feature = "testing")]
+                            if let Some(failpoint_sem) = failpoint_sem {
+                                let _permit = failpoint_sem.acquire().await.map_err(|_| {
+                                    anyhow::anyhow!(
+                                        "failpoint: timeline-detach-ancestor::allow_one_reparented",
+                                    )
+                                })?;
+                                failpoint_sem.close();
+                            }
 
-            let tl_ancestor = tl.ancestor_timeline.as_ref()?;
-            let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
-            let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
-
-            let is_deleting = tl
-                .delete_progress
-                .try_lock()
-                .map(|flow| !flow.is_not_started())
-                .unwrap_or(true);
-
-            if is_same && is_earlier && !is_deleting {
-                Some(tl.clone())
-            } else {
-                None
-            }
-        })
-        .for_each(|timeline| {
-            // important in this scope: we are holding the Tenant::timelines lock
-            let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
-            let new_parent = detached.timeline_id;
-
-            tasks.spawn(
-                async move {
-                    let res = timeline
-                        .remote_client
-                        .schedule_reparenting_and_wait(&new_parent)
+                            timeline
+                                .remote_client
+                                .schedule_reparenting_and_wait(&new_parent)
+                                .await
+                        }
                         .await;
 
-                    match res {
-                        Ok(()) => Some(timeline),
-                        Err(e) => {
-                            // with the use of tenant slot, we no longer expect these.
-                            tracing::warn!("reparenting failed: {e:#}");
-                            None
+                        match res {
+                            Ok(()) => {
+                                tracing::info!("reparented");
+                                Some(timeline)
+                            }
+                            Err(e) => {
+                                // with the use of tenant slot, raced timeline deletion is the most
+                                // likely reason.
+                                tracing::warn!("reparenting failed: {e:#}");
+                                None
+                            }
                         }
                     }
-                }
-                .instrument(span),
-            );
-        });
+                    .instrument(span),
+                );
+            });
+    }
 
     let reparenting_candidates = tasks.len();
     let mut reparented = HashSet::with_capacity(tasks.len());
@@ -645,33 +865,103 @@ pub(super) async fn complete(
     while let Some(res) = tasks.join_next().await {
         match res {
             Ok(Some(timeline)) => {
-                tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
-
                 assert!(
                     reparented.insert(timeline.timeline_id),
                     "duplicate reparenting? timeline_id={}",
                     timeline.timeline_id
                 );
             }
-            Ok(None) => {
-                // lets just ignore this for now. one or all reparented timelines could had
-                // started deletion, and that is fine.
-            }
             Err(je) if je.is_cancelled() => unreachable!("not used"),
-            Err(je) if je.is_panic() => {
-                // ignore; it's better to continue with a single reparenting failing (or even
-                // all of them) in order to get to the goal state.
-                //
-                // these timelines will never be reparentable, but they can be always detached as
-                // separate tree roots.
-            }
+            // just ignore failures now, we can retry
+            Ok(None) => {}
+            Err(je) if je.is_panic() => {}
             Err(je) => tracing::error!("unexpected join error: {je:?}"),
         }
     }
 
-    if reparenting_candidates != reparented.len() {
-        tracing::info!("failed to reparent some candidates");
+    let reparented_all = reparenting_candidates == reparented.len();
+
+    if reparented_all {
+        Ok(DetachingAndReparenting::Reparented(reparented))
+    } else {
+        tracing::info!(
+            reparented = reparented.len(),
+            candidates = reparenting_candidates,
+            "failed to reparent all candidates; they can be retried after the tenant_reset",
+        );
+
+        let must_reset_tenant = !reparented.is_empty() || was_detached;
+        Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant })
+    }
+}
+
+pub(super) async fn complete(
+    detached: &Arc<Timeline>,
+    tenant: &Tenant,
+    mut attempt: Attempt,
+    _ctx: &RequestContext,
+) -> Result<(), Error> {
+    assert_eq!(detached.timeline_id, attempt.timeline_id);
+
+    if attempt.gate_entered.is_none() {
+        let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?;
+        attempt.gate_entered = Some(entered);
+    } else {
+        // Some(gate_entered) means the tenant was not restarted, as is not required
     }
 
-    Ok(reparented)
+    assert!(detached.ancestor_timeline.is_none());
+
+    // this should be an 503 at least...?
+    fail::fail_point!(
+        "timeline-detach-ancestor::complete_before_uploading",
+        |_| Err(Error::Failpoint(
+            "timeline-detach-ancestor::complete_before_uploading"
+        ))
+    );
+
+    tenant
+        .gc_block
+        .remove(
+            detached,
+            crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
+        )
+        .await
+        // FIXME: better error
+        .map_err(Error::Unexpected)?;
+
+    Ok(())
+}
+
+/// Query against a locked `Tenant::timelines`.
+fn reparentable_timelines<'a, I>(
+    timelines: I,
+    detached: &'a Arc<Timeline>,
+    ancestor: &'a Arc<Timeline>,
+    ancestor_lsn: Lsn,
+) -> impl Iterator<Item = &'a Arc<Timeline>> + 'a
+where
+    I: Iterator<Item = &'a Arc<Timeline>> + 'a,
+{
+    timelines.filter_map(move |tl| {
+        if Arc::ptr_eq(tl, detached) {
+            return None;
+        }
+
+        let tl_ancestor = tl.ancestor_timeline.as_ref()?;
+        let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
+        let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
+
+        let is_deleting = tl
+            .delete_progress
+            .try_lock()
+            .map(|flow| !flow.is_not_started())
+            .unwrap_or(true);
+
+        if is_same && is_earlier && !is_deleting {
+            Some(tl)
+        } else {
+            None
+        }
+    })
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index fe582cf0e2..ee8e9ac5a1 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2989,6 +2989,7 @@ impl Service {
             );
 
             let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+
             client
                 .timeline_detach_ancestor(tenant_shard_id, timeline_id)
                 .await
@@ -3005,7 +3006,8 @@ impl Service {
                         Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
                             ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
                         }
-                        // rest can be mapped
+                        // rest can be mapped as usual
+                        // FIXME: this converts some 500 to 409 which is not per openapi
                         other => passthrough_api_error(&node, other),
                     }
                 })
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 4e409eeb17..902457c2ac 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -5,7 +5,7 @@ import time
 from concurrent.futures import ThreadPoolExecutor
 from queue import Empty, Queue
 from threading import Barrier
-from typing import List, Tuple
+from typing import List, Set, Tuple
 
 import pytest
 from fixtures.common_types import Lsn, TimelineId
@@ -807,22 +807,24 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
 
     What remains not tested by this:
     - shutdown winning over complete
-
-    Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry.
     """
 
     if sharded and mode == "delete_tenant":
         # the shared/exclusive lock for tenant is blocking this:
         # timeline detach ancestor takes shared, delete tenant takes exclusive
-        pytest.skip(
-            "tenant deletion while timeline ancestor detach is underway is not supported yet"
-        )
+        pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen")
 
     shard_count = 2 if sharded else 1
 
     neon_env_builder.num_pageservers = shard_count
 
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None)
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count if sharded else None,
+        initial_tenant_conf={
+            "gc_period": "1s",
+            "lsn_lease_length": "0s",
+        },
+    )
 
     for ps in env.pageservers:
         ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
@@ -831,7 +833,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
 
     detached_timeline = env.neon_cli.create_branch("detached soon", "main")
 
-    failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable"
+    pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable"
 
     env.storage_controller.reconcile_until_idle()
     shards = env.storage_controller.locate(env.initial_tenant)
@@ -843,13 +845,20 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
 
     victim = pageservers[int(shards[-1]["node_id"])]
     victim_http = victim.http_client()
-    victim_http.configure_failpoints((failpoint, "pause"))
+    victim_http.configure_failpoints((pausepoint, "pause"))
 
     def detach_ancestor():
         target.detach_ancestor(env.initial_tenant, detached_timeline)
 
-    def at_failpoint() -> Tuple[str, LogCursor]:
-        return victim.assert_log_contains(f"at failpoint {failpoint}")
+    def at_failpoint() -> LogCursor:
+        msg, offset = victim.assert_log_contains(f"at failpoint {pausepoint}")
+        log.info(f"found {msg}")
+        msg, offset = victim.assert_log_contains(
+            ".* gc_loop.*: Skipping GC: .*",
+            offset,
+        )
+        log.info(f"found {msg}")
+        return offset
 
     def start_delete():
         if mode == "delete_timeline":
@@ -882,23 +891,44 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
     with ThreadPoolExecutor(max_workers=2) as pool:
         try:
             fut = pool.submit(detach_ancestor)
-            _, offset = wait_until(10, 1.0, at_failpoint)
+            offset = wait_until(10, 1.0, at_failpoint)
 
             delete = pool.submit(start_delete)
 
-            wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset))
+            offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset))
 
-            victim_http.configure_failpoints((failpoint, "off"))
+            victim_http.configure_failpoints((pausepoint, "off"))
 
             delete.result()
 
             assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}"
 
+            # TODO: match the error
             with pytest.raises(PageserverApiException) as exc:
                 fut.result()
+            log.info(f"TODO: match this error: {exc.value}")
             assert exc.value.status_code == 503
         finally:
-            victim_http.configure_failpoints((failpoint, "off"))
+            victim_http.configure_failpoints((pausepoint, "off"))
+
+    if mode != "delete_timeline":
+        return
+
+    # make sure the gc is unblocked
+    time.sleep(2)
+    victim.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
+
+    if not sharded:
+        # we have the other node only while sharded
+        return
+
+    other = pageservers[int(shards[0]["node_id"])]
+    log.info(f"other is {other.id}")
+    _, offset = other.assert_log_contains(
+        ".*INFO request\\{method=PUT path=/v1/tenant/\\S+/timeline/\\S+/detach_ancestor .*\\}: Request handled, status: 200 OK",
+    )
+    # this might be a lot earlier than the victims line, but that is okay.
+    _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
 
 
 @pytest.mark.parametrize("mode", ["delete_reparentable_timeline"])
@@ -915,7 +945,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
 
     assert (
         mode == "delete_reparentable_timeline"
-    ), "only one now, but we could have the create just as well, need gc blocking"
+    ), "only one now, but creating reparentable timelines cannot be supported even with gc blocking"
+    # perhaps it could be supported by always doing this for the shard0 first, and after that for others.
+    # when we run shard0 to completion, we can use it's timelines to restrict which can be reparented.
 
     shard_count = 2
     neon_env_builder.num_pageservers = shard_count
@@ -1048,10 +1080,267 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
             victim_http.configure_failpoints((pausepoint, "off"))
 
 
+def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder):
+    """
+    Using a failpoint, force the completion step of timeline ancestor detach to
+    fail after reparenting a single timeline.
+
+    Retrying should try reparenting until all reparentings are done, all the
+    time blocking gc even across restarts (first round).
+
+    A completion failpoint is used to inhibit completion on second to last
+    round.
+
+    On last round, the completion uses a path where no reparentings can happen
+    because original ancestor is deleted, and there is a completion to unblock
+    gc without restart.
+    """
+
+    # to get the remote storage metrics
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "1s",
+            "lsn_lease_length": "0s",
+        }
+    )
+
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    env.pageserver.allowed_errors.extend(
+        [
+            ".* reparenting failed: failpoint: timeline-detach-ancestor::allow_one_reparented",
+            ".* Error processing HTTP request: InternalServerError\\(failed to reparent all candidate timelines, please retry",
+            ".* Error processing HTTP request: InternalServerError\\(failpoint: timeline-detach-ancestor::complete_before_uploading",
+        ]
+    )
+
+    http = env.pageserver.http_client()
+
+    def remote_storage_copy_requests():
+        return http.get_metric_value(
+            "remote_storage_s3_request_seconds_count",
+            {"request_type": "copy_object", "result": "ok"},
+        )
+
+    def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[TimelineId]]:
+        reparented = 0
+        not_reparented = set()
+        for timeline in timelines:
+            detail = http.timeline_detail(env.initial_tenant, timeline)
+            ancestor = TimelineId(detail["ancestor_timeline_id"])
+            if ancestor == detached:
+                reparented += 1
+            else:
+                not_reparented.add(timeline)
+        return (reparented, not_reparented)
+
+    # main ------A-----B-----C-----D-----E> lsn
+    timelines = []
+    with env.endpoints.create_start("main") as ep:
+        for counter in range(5):
+            ep.safe_psql(
+                f"create table foo_{counter} as select i::bigint from generate_series(1, 10000) t(i)"
+            )
+            branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+            http.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+            branch = env.neon_cli.create_branch(
+                f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn
+            )
+            timelines.append(branch)
+
+        flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline)
+
+    # detach "E" which has most reparentable timelines under it
+    detached = timelines.pop()
+    assert len(timelines) == 4
+
+    http = http.without_status_retrying()
+
+    http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return"))
+
+    not_reparented: Set[TimelineId] = set()
+    # tracked offset in the pageserver log which is at least at the most recent activation
+    offset = None
+
+    def try_detach():
+        with pytest.raises(
+            PageserverApiException,
+            match=".*failed to reparent all candidate timelines, please retry",
+        ) as exc:
+            http.detach_ancestor(env.initial_tenant, detached)
+        assert exc.value.status_code == 500
+
+    # first round -- do more checking to make sure the gc gets paused
+    try_detach()
+
+    assert (
+        http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None
+    ), "first round should had detached 'detached'"
+
+    reparented, not_reparented = reparenting_progress(timelines)
+    assert reparented == 1
+
+    time.sleep(2)
+    _, offset = env.pageserver.assert_log_contains(
+        ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request",
+        offset,
+    )
+    _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset)
+    _, offset = env.pageserver.assert_log_contains(
+        ".* gc_loop.*: Skipping GC: .*",
+        offset,
+    )
+    metric = remote_storage_copy_requests()
+    assert metric != 0
+    # make sure the gc blocking is persistent over a restart
+    env.pageserver.restart()
+    env.pageserver.quiesce_tenants()
+    time.sleep(2)
+    _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset)
+    assert env.pageserver.log_contains(".* gc_loop.*: [0-9] timelines need GC", offset) is None
+    _, offset = env.pageserver.assert_log_contains(
+        ".* gc_loop.*: Skipping GC: .*",
+        offset,
+    )
+    # restore failpoint for the next reparented
+    http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return"))
+
+    reparented_before = reparented
+
+    # do two more rounds
+    for _ in range(2):
+        try_detach()
+
+        assert (
+            http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None
+        ), "first round should had detached 'detached'"
+
+        reparented, not_reparented = reparenting_progress(timelines)
+        assert reparented == reparented_before + 1
+        reparented_before = reparented
+
+        _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset)
+        metric = remote_storage_copy_requests()
+        assert metric == 0, "copies happen in the first round"
+
+    assert offset is not None
+    assert len(not_reparented) == 1
+
+    http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "return"))
+
+    # almost final round, the failpoint is hit no longer as there is only one reparented and one always gets to succeed.
+    # the tenant is restarted once more, but we fail during completing.
+    with pytest.raises(
+        PageserverApiException, match=".* timeline-detach-ancestor::complete_before_uploading"
+    ) as exc:
+        http.detach_ancestor(env.initial_tenant, detached)
+    assert exc.value.status_code == 500
+    _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset)
+
+    # delete the previous ancestor to take a different path to completion. all
+    # other tests take the "detach? reparent complete", but this only hits
+    # "complete".
+    http.timeline_delete(env.initial_tenant, env.initial_timeline)
+    wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20)
+
+    http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off"))
+
+    reparented_resp = http.detach_ancestor(env.initial_tenant, detached)
+    assert reparented_resp == set(timelines)
+    # no need to quiesce_tenants anymore, because completion does that
+
+    reparented, not_reparented = reparenting_progress(timelines)
+    assert reparented == len(timelines)
+
+    time.sleep(2)
+    assert (
+        env.pageserver.log_contains(".*: attach finished, activating", offset) is None
+    ), "there should be no restart with the final detach_ancestor as it only completed"
+
+    # gc is unblocked
+    env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset)
+
+    metric = remote_storage_copy_requests()
+    assert metric == 0
+
+
+def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Make sure that a timeline deleted after restart will unpause gc blocking.
+    """
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "1s",
+            "lsn_lease_length": "0s",
+        }
+    )
+
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    http = env.pageserver.http_client()
+
+    detached = env.neon_cli.create_branch("detached")
+
+    failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable"
+
+    http.configure_failpoints((failpoint, "pause"))
+
+    def detach_and_get_stuck():
+        return http.detach_ancestor(env.initial_tenant, detached)
+
+    def request_processing_noted_in_log():
+        _, offset = env.pageserver.assert_log_contains(
+            ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request",
+        )
+        return offset
+
+    def delete_detached():
+        return http.timeline_delete(env.initial_tenant, detached)
+
+    try:
+        with ThreadPoolExecutor(max_workers=1) as pool:
+            detach = pool.submit(detach_and_get_stuck)
+
+            offset = wait_until(10, 1.0, request_processing_noted_in_log)
+
+            # make this named fn tor more clear failure test output logging
+            def pausepoint_hit_with_gc_paused() -> LogCursor:
+                env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+                _, at = env.pageserver.assert_log_contains(
+                    ".* gc_loop.*: Skipping GC: .*",
+                    offset,
+                )
+                return at
+
+            offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused)
+
+            delete_detached()
+
+            wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0)
+
+            http.configure_failpoints((failpoint, "off"))
+
+            with pytest.raises(PageserverApiException) as exc:
+                detach.result()
+
+            # FIXME: this should be 404 but because there is another Anyhow conversion it is 500
+            assert exc.value.status_code == 500
+            env.pageserver.allowed_errors.append(
+                ".*Error processing HTTP request: InternalServerError\\(detached timeline was not found after restart"
+            )
+    finally:
+        http.configure_failpoints((failpoint, "off"))
+
+    # make sure gc has been unblocked
+    time.sleep(2)
+
+    env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
+
+
 # TODO:
-# - after starting the operation, pageserver is shutdown, restarted
-# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited
-# - deletion of reparented while reparenting should fail once, then succeed (?)
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.
 #

From 0f43b7c51b622e59d7485e52ac572378dcb78afc Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 15:31:55 -0500
Subject: [PATCH 30/62] Loosen type on PgProtocol::safe_psql(queries:)

Using Iterable allows us to also use tuples, among other things.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 961dbde95c..aaa1f21997 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -24,7 +24,7 @@ from functools import cached_property, partial
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast
 from urllib.parse import quote, urlparse
 
 import asyncpg
@@ -388,7 +388,7 @@ class PgProtocol:
         return self.safe_psql_many([query], **kwargs)[0]
 
     def safe_psql_many(
-        self, queries: List[str], log_query=True, **kwargs: Any
+        self, queries: Iterable[str], log_query=True, **kwargs: Any
     ) -> List[List[Tuple[Any, ...]]]:
         """
         Execute queries against the node and return all rows.

From c624317b0e2ff73c6c01904e4d883c256c078f22 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 15:34:10 -0500
Subject: [PATCH 31/62] Decode the database name in SQL/HTTP connections

A url::Url does not hand you back a URL decoded value for path values,
so we must decode them ourselves.

Link: https://docs.rs/url/2.5.2/url/struct.Url.html#method.path
Link: https://docs.rs/url/2.5.2/url/struct.Url.html#method.path_segments
Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 proxy/src/serverless/sql_over_http.rs |  4 +++-
 test_runner/regress/test_proxy.py     | 26 ++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index e5b6536328..c41df07a4d 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -34,6 +34,7 @@ use tracing::error;
 use tracing::info;
 use typed_json::json;
 use url::Url;
+use urlencoding;
 use utils::http::error::ApiError;
 
 use crate::auth::backend::ComputeUserInfo;
@@ -168,7 +169,8 @@ fn get_conn_info(
         .path_segments()
         .ok_or(ConnInfoError::MissingDbName)?;
 
-    let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into();
+    let dbname: DbName =
+        urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into();
     ctx.set_dbname(dbname.clone());
 
     let username = RoleName::from(urlencoding::decode(connection_url.username())?);
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index f446f4f200..d2b8c2ed8b 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -2,6 +2,7 @@ import asyncio
 import json
 import subprocess
 import time
+import urllib.parse
 from typing import Any, List, Optional, Tuple
 
 import psycopg2
@@ -275,6 +276,31 @@ def test_sql_over_http(static_proxy: NeonProxy):
     assert res["rowCount"] is None
 
 
+def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy):
+    db = "db with spaces"
+    static_proxy.safe_psql_many(
+        (
+            f'create database "{db}"',
+            "create role http with login password 'http' superuser",
+        )
+    )
+
+    def q(sql: str, params: Optional[List[Any]] = None) -> Any:
+        params = params or []
+        connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}"
+        response = requests.post(
+            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+            data=json.dumps({"query": sql, "params": params}),
+            headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr},
+            verify=str(static_proxy.test_output_dir / "proxy.crt"),
+        )
+        assert response.status_code == 200, response.text
+        return response.json()
+
+    rows = q("select 42 as answer")["rows"]
+    assert rows == [{"answer": 42}]
+
+
 def test_sql_over_http_output_options(static_proxy: NeonProxy):
     static_proxy.safe_psql("create role http2 with login password 'http2' superuser")
 

From 7a1736ddcf0ed0e60bbb8b5a030c8e5349041c37 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 14 Aug 2024 08:13:20 +0300
Subject: [PATCH 32/62] Preserve HEAP_COMBOCID when restoring t_cid from WAL
 (#8503)

## Problem

See https://github.com/neondatabase/neon/issues/8499

## Summary of changes

Save HEAP_COMBOCID flag in WAL and do not clear it in redo handlers.

Related Postgres PRs:
https://github.com/neondatabase/postgres/pull/457
https://github.com/neondatabase/postgres/pull/458
https://github.com/neondatabase/postgres/pull/459


## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pgxn/neon_rmgr/neon_rmgr.c           |  16 +--
 test_runner/regress/test_combocid.py | 139 +++++++++++++++++++++++++++
 vendor/postgres-v14                  |   2 +-
 vendor/postgres-v15                  |   2 +-
 vendor/postgres-v16                  |   2 +-
 vendor/revisions.json                |   6 +-
 6 files changed, 154 insertions(+), 13 deletions(-)
 create mode 100644 test_runner/regress/test_combocid.py

diff --git a/pgxn/neon_rmgr/neon_rmgr.c b/pgxn/neon_rmgr/neon_rmgr.c
index 496ca08c08..c3f726db84 100644
--- a/pgxn/neon_rmgr/neon_rmgr.c
+++ b/pgxn/neon_rmgr/neon_rmgr.c
@@ -186,7 +186,7 @@ static void
 fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
 {
 	*infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
-				   HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
+				   HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID);
 	*infomask2 &= ~HEAP_KEYS_UPDATED;
 
 	if (infobits & XLHL_XMAX_IS_MULTI)
@@ -195,6 +195,8 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
 		*infomask |= HEAP_XMAX_LOCK_ONLY;
 	if (infobits & XLHL_XMAX_EXCL_LOCK)
 		*infomask |= HEAP_XMAX_EXCL_LOCK;
+	if (infobits & XLHL_COMBOCID)
+		*infomask |= HEAP_COMBOCID;
 	/* note HEAP_XMAX_SHR_LOCK isn't considered here */
 	if (infobits & XLHL_XMAX_KEYSHR_LOCK)
 		*infomask |= HEAP_XMAX_KEYSHR_LOCK;
@@ -284,7 +286,7 @@ redo_neon_heap_insert(XLogReaderState *record)
 		htup->t_infomask = xlhdr.t_infomask;
 		htup->t_hoff = xlhdr.t_hoff;
 		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
-		HeapTupleHeaderSetCmin(htup, xlhdr.t_cid);
+		htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid;
 		htup->t_ctid = target_tid;
 
 		if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
@@ -373,7 +375,7 @@ redo_neon_heap_delete(XLogReaderState *record)
 			HeapTupleHeaderSetXmax(htup, xlrec->xmax);
 		else
 			HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
-		HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);
+		htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;
 
 		/* Mark the page as a candidate for pruning */
 		PageSetPrunable(page, XLogRecGetXid(record));
@@ -490,7 +492,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update)
 		fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
 								   &htup->t_infomask2);
 		HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
-		HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);
+		htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;
 		/* Set forward chain link in t_ctid */
 		htup->t_ctid = newtid;
 
@@ -623,7 +625,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update)
 		htup->t_hoff = xlhdr.t_hoff;
 
 		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
-		HeapTupleHeaderSetCmin(htup, xlhdr.t_cid);
+		htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid;
 		HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
 		/* Make sure there is no forward chain link in t_ctid */
 		htup->t_ctid = newtid;
@@ -728,7 +730,7 @@ redo_neon_heap_lock(XLogReaderState *record)
 						   offnum);
 		}
 		HeapTupleHeaderSetXmax(htup, xlrec->xmax);
-		HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);
+		htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;
 		PageSetLSN(page, lsn);
 		MarkBufferDirty(buffer);
 	}
@@ -840,7 +842,7 @@ redo_neon_heap_multi_insert(XLogReaderState *record)
 			htup->t_infomask = xlhdr->t_infomask;
 			htup->t_hoff = xlhdr->t_hoff;
 			HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
-			HeapTupleHeaderSetCmin(htup, xlrec->t_cid);
+			htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;
 			ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
 			ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
 
diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py
new file mode 100644
index 0000000000..6d2567b7ee
--- /dev/null
+++ b/test_runner/regress/test_combocid.py
@@ -0,0 +1,139 @@
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "shared_buffers='1MB'",
+        ],
+    )
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+    n_records = 1000
+
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    cur.execute("create table t(id integer, val integer)")
+
+    cur.execute("begin")
+    cur.execute("insert into t values (1, 0)")
+    cur.execute("insert into t values (2, 0)")
+    cur.execute(f"insert into t select g, 0 from generate_series(3,{n_records}) g")
+
+    # Open a cursor that scroll it halfway through
+    cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t")
+    cur.execute("fetch 500 from c1")
+    rows = cur.fetchall()
+    assert len(rows) == 500
+
+    # Perform specified operation
+    cur.execute(op)
+
+    # Clear the cache, so that we exercise reconstructing the pages
+    # from WAL
+    cur.execute("SELECT clear_buffer_cache()")
+
+    # Check that the cursor opened earlier still works. If the
+    # combocids are not restored correctly, it won't.
+    cur.execute("fetch all from c1")
+    rows = cur.fetchall()
+    assert len(rows) == 500
+
+    cur.execute("rollback")
+
+
+def test_combocid_delete(neon_env_builder: NeonEnvBuilder):
+    do_combocid_op(neon_env_builder, "delete from t")
+
+
+def test_combocid_update(neon_env_builder: NeonEnvBuilder):
+    do_combocid_op(neon_env_builder, "update t set val=val+1")
+
+
+def test_combocid_lock(neon_env_builder: NeonEnvBuilder):
+    do_combocid_op(neon_env_builder, "select * from t for update")
+
+
+def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "shared_buffers='1MB'",
+        ],
+    )
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+    n_records = 1000
+
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    cur.execute("create table t(id integer, val integer)")
+    file_path = f"{endpoint.pg_data_dir_path()}/t.csv"
+    cur.execute(f"insert into t select g, 0 from generate_series(1,{n_records}) g")
+    cur.execute(f"copy t to '{file_path}'")
+    cur.execute("truncate table t")
+
+    cur.execute("begin")
+    cur.execute(f"copy t from '{file_path}'")
+
+    # Open a cursor that scroll it halfway through
+    cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t")
+    cur.execute("fetch 500 from c1")
+    rows = cur.fetchall()
+    assert len(rows) == 500
+
+    # Delete all the rows. Because all of the rows were inserted earlier in the
+    # same transaction, all the rows will get a combocid.
+    cur.execute("delete from t")
+    # Clear the cache, so that we exercise reconstructing the pages
+    # from WAL
+    cur.execute("SELECT clear_buffer_cache()")
+
+    # Check that the cursor opened earlier still works. If the
+    # combocids are not restored correctly, it won't.
+    cur.execute("fetch all from c1")
+    rows = cur.fetchall()
+    assert len(rows) == 500
+
+    cur.execute("rollback")
+
+
+def test_combocid(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start("main")
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+    n_records = 100000
+
+    cur.execute("create table t(id integer, val integer)")
+    cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)")
+
+    cur.execute("begin")
+
+    cur.execute("update t set val=val+1")
+    assert cur.rowcount == n_records
+    cur.execute("update t set val=val+1")
+    assert cur.rowcount == n_records
+    cur.execute("update t set val=val+1")
+    assert cur.rowcount == n_records
+
+    cur.execute("delete from t")
+    assert cur.rowcount == n_records
+    cur.execute("delete from t")
+    assert cur.rowcount == 0
+
+    cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)")
+    cur.execute("update t set val=val+1")
+    assert cur.rowcount == n_records
+    cur.execute("update t set val=val+1")
+    assert cur.rowcount == n_records
+    cur.execute("update t set val=val+1")
+    assert cur.rowcount == n_records
+
+    cur.execute("rollback")
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index a48faca1d9..3fd7a45f8a 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit a48faca1d9aef59649dd1bf34bc1b6303fa3489e
+Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 39c51c33b3..46b4b235f3 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 39c51c33b383239c78b86afe561679f980e44842
+Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 5ea106b258..47a9122a5a 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 5ea106b2583285849784e774b39d62eb2615bd5d
+Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a
diff --git a/vendor/revisions.json b/vendor/revisions.json
index f983407268..6e3e489b5d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,14 +1,14 @@
 {
   "v16": [
     "16.3",
-    "5ea106b2583285849784e774b39d62eb2615bd5d"
+    "47a9122a5a150a3217fafd3f3d4fe8e020ea718a"
   ],
   "v15": [
     "15.7",
-    "39c51c33b383239c78b86afe561679f980e44842"
+    "46b4b235f38413ab5974bb22c022f9b829257674"
   ],
   "v14": [
     "14.12",
-    "a48faca1d9aef59649dd1bf34bc1b6303fa3489e"
+    "3fd7a45f8aae85c080df6329e3c85887b7f3a737"
   ]
 }

From 4049d2b7e1a73ed1dfbadbe759fd3b1f4247606b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 14 Aug 2024 09:29:06 +0100
Subject: [PATCH 33/62] scrubber: fix spurious "Missed some shards" errors
 (#8661)

## Problem

The storage scrubber was reporting warnings for lots of timelines like:
```
WARN Missed some shards at count ShardCount(0) tenant_id=25eb7a83d9a2f90ac0b765b6ca84cf4c
```

These were spurious: these tenants are fine. There was a bug in
accumulating the ShardIndex for each tenant, whereby multiple timelines
would lead us to add the same ShardIndex more than one.

Closes: #8646

## Summary of changes

- Accumulate ShardIndex in a BTreeSet instead of a Vec
- Extend the test to reproduce the issue
---
 pageserver/src/http/routes.rs                 |  2 ++
 .../src/pageserver_physical_gc.rs             | 13 ++++----
 test_runner/regress/test_storage_scrubber.py  | 30 ++++++++++++-------
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 6f7480cc6c..fd4ead9d47 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1787,9 +1787,11 @@ async fn timeline_checkpoint_handler(
         }
 
         if wait_until_uploaded {
+            tracing::info!("Waiting for uploads to complete...");
             timeline.remote_client.wait_completion().await
             // XXX map to correct ApiError for the cases where it's due to shutdown
             .context("wait completion").map_err(ApiError::InternalServerError)?;
+            tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0)));
         }
 
         json_response(StatusCode::OK, ())
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index ff230feae3..c8b1ed49f4 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 
@@ -117,7 +117,7 @@ use refs::AncestorRefs;
 // - Are there any refs to ancestor shards' layers?
 #[derive(Default)]
 struct TenantRefAccumulator {
-    shards_seen: HashMap<TenantId, Vec<ShardIndex>>,
+    shards_seen: HashMap<TenantId, BTreeSet<ShardIndex>>,
 
     // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to
     ancestor_ref_shards: AncestorRefs,
@@ -130,7 +130,7 @@ impl TenantRefAccumulator {
             .shards_seen
             .entry(ttid.tenant_shard_id.tenant_id)
             .or_default())
-        .push(this_shard_idx);
+        .insert(this_shard_idx);
 
         let mut ancestor_refs = Vec::new();
         for (layer_name, layer_metadata) in &index_part.layer_metadata {
@@ -154,7 +154,7 @@ impl TenantRefAccumulator {
         summary: &mut GcSummary,
     ) -> (Vec<TenantShardId>, AncestorRefs) {
         let mut ancestors_to_gc = Vec::new();
-        for (tenant_id, mut shard_indices) in self.shards_seen {
+        for (tenant_id, shard_indices) in self.shards_seen {
             // Find the highest shard count
             let latest_count = shard_indices
                 .iter()
@@ -162,6 +162,7 @@ impl TenantRefAccumulator {
                 .max()
                 .expect("Always at least one shard");
 
+            let mut shard_indices = shard_indices.iter().collect::<Vec<_>>();
             let (mut latest_shards, ancestor_shards) = {
                 let at =
                     itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count);
@@ -174,7 +175,7 @@ impl TenantRefAccumulator {
             // to scan the S3 bucket halfway through a shard split.
             if latest_shards.len() != latest_count.count() as usize {
                 // This should be extremely rare, so we warn on it.
-                tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count);
+                tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count);
                 continue;
             }
 
@@ -212,7 +213,7 @@ impl TenantRefAccumulator {
                         .iter()
                         .map(|s| s.tenant_shard_id.to_index())
                         .collect();
-                    if controller_indices != latest_shards {
+                    if !controller_indices.iter().eq(latest_shards.iter().copied()) {
                         tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})");
                         continue;
                     }
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 388f6a9e92..2844d1b1d2 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors(
         },
     )
 
+    # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines
+    env.storage_controller.pageserver_api().timeline_create(
+        env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
+
     # Make sure the original shard has some layers
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
@@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors(
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
     env.storage_controller.reconcile_until_idle()  # Move shards to their final locations immediately
 
+    # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors
+    env.storage_controller.pageserver_api().timeline_create(
+        env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
+
     # Make sure child shards have some layers.  Do not force upload, because the test helper calls checkpoint, which
     # compacts, and we only want to do tha explicitly later in the test.
     workload.write_rows(100, upload=False)
@@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
     # Make sure the original shard has some layers
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
-    workload.write_rows(100)
+    workload.write_rows(100, upload=False)
+    workload.stop()
 
     new_shard_count = 4
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
+        ps.http_client().timeline_checkpoint(
+            shard, timeline_id, compact=False, wait_until_uploaded=True
+        )
+
+        ps.http_client().deletion_queue_flush(execute=True)
 
     # Create a second timeline so that when we delete the first one, child shards still have some content in S3.
     #
@@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
         PgVersion.NOT_SET, tenant_id, other_timeline_id
     )
 
-    # Write after split so that child shards have some indices in S3
-    workload.write_rows(100, upload=False)
-    for shard in shards:
-        ps = env.get_tenant_pageserver(shard)
-        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
-        ps.http_client().timeline_checkpoint(
-            shard, timeline_id, compact=False, wait_until_uploaded=True
-        )
-
     # The timeline still exists in child shards and they reference its layers, so scrubbing
     # now shouldn't delete anything.
     gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")

From 485d76ac622dcb8d847ecce9eef2ca714768e7df Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 14 Aug 2024 12:16:18 +0300
Subject: [PATCH 34/62] timeline_detach_ancestor: adjust error handling (#8528)

With additional phases from #8430 the `detach_ancestor::Error` became
untenable. Split it up into phases, and introduce laundering for
remaining `anyhow::Error` to propagate them as most often
`Error::ShuttingDown`.

Additionally, complete FIXMEs.

Cc: #6994
---
 libs/remote_storage/src/error.rs              |   4 +
 pageserver/src/http/routes.rs                 |   3 +-
 pageserver/src/tenant/mgr.rs                  | 100 +++----
 pageserver/src/tenant/storage_layer/layer.rs  |   6 +
 pageserver/src/tenant/timeline.rs             |   2 +-
 .../src/tenant/timeline/detach_ancestor.rs    | 181 ++++++-------
 storage_controller/src/service.rs             |   9 +-
 .../regress/test_timeline_detach_ancestor.py  | 246 ++++++++++++++----
 8 files changed, 347 insertions(+), 204 deletions(-)

diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs
index 66422853e1..5fd0eaabc7 100644
--- a/libs/remote_storage/src/error.rs
+++ b/libs/remote_storage/src/error.rs
@@ -42,6 +42,10 @@ impl DownloadError {
             Timeout | Other(_) => false,
         }
     }
+
+    pub fn is_cancelled(&self) -> bool {
+        matches!(self, DownloadError::Cancelled)
+    }
 }
 
 impl From<std::io::Error> for DownloadError {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index fd4ead9d47..d209f4eced 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1900,8 +1900,7 @@ async fn timeline_detach_ancestor_handler(
                         attempt,
                         ctx,
                     )
-                    .await
-                    .map_err(ApiError::InternalServerError)?;
+                    .await?;
 
                 AncestorDetached {
                     reparented_timelines,
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 5f2539d426..4e6ea0c8f9 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1929,61 +1929,51 @@ impl TenantManager {
         prepared: PreparedTimelineDetach,
         mut attempt: detach_ancestor::Attempt,
         ctx: &RequestContext,
-    ) -> Result<HashSet<TimelineId>, anyhow::Error> {
-        use crate::tenant::timeline::detach_ancestor::Error;
-        // FIXME: this is unnecessary, slotguard already has these semantics
-        struct RevertOnDropSlot(Option<SlotGuard>);
+    ) -> Result<HashSet<TimelineId>, detach_ancestor::Error> {
+        use detach_ancestor::Error;
 
-        impl Drop for RevertOnDropSlot {
-            fn drop(&mut self) {
-                if let Some(taken) = self.0.take() {
-                    taken.revert();
-                }
-            }
-        }
+        let slot_guard =
+            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist).map_err(
+                |e| {
+                    use TenantSlotError::*;
 
-        impl RevertOnDropSlot {
-            fn into_inner(mut self) -> SlotGuard {
-                self.0.take().unwrap()
-            }
-        }
-
-        impl std::ops::Deref for RevertOnDropSlot {
-            type Target = SlotGuard;
-
-            fn deref(&self) -> &Self::Target {
-                self.0.as_ref().unwrap()
-            }
-        }
-
-        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        let slot_guard = RevertOnDropSlot(Some(slot_guard));
+                    match e {
+                        MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown,
+                        NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()),
+                    }
+                },
+            )?;
 
         let tenant = {
-            let Some(old_slot) = slot_guard.get_old_value() else {
-                anyhow::bail!(
-                    "Tenant not found when trying to complete detaching timeline ancestor"
-                );
-            };
+            let old_slot = slot_guard
+                .get_old_value()
+                .as_ref()
+                .expect("requested MustExist");
 
             let Some(tenant) = old_slot.get_attached() else {
-                anyhow::bail!("Tenant is not in attached state");
+                return Err(Error::DetachReparent(anyhow::anyhow!(
+                    "Tenant is not in attached state"
+                )));
             };
 
             if !tenant.is_active() {
-                anyhow::bail!("Tenant is not active");
+                return Err(Error::DetachReparent(anyhow::anyhow!(
+                    "Tenant is not active"
+                )));
             }
 
             tenant.clone()
         };
 
-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(Error::NotFound)?;
 
         let resp = timeline
             .detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
             .await?;
 
-        let mut slot_guard = slot_guard.into_inner();
+        let mut slot_guard = slot_guard;
 
         let tenant = if resp.reset_tenant_required() {
             attempt.before_reset_tenant();
@@ -1991,17 +1981,20 @@ impl TenantManager {
             let (_guard, progress) = utils::completion::channel();
             match tenant.shutdown(progress, ShutdownMode::Hard).await {
                 Ok(()) => {
-                    slot_guard.drop_old_value()?;
+                    slot_guard.drop_old_value().expect("it was just shutdown");
                 }
                 Err(_barrier) => {
                     slot_guard.revert();
-                    // this really should not happen, at all, unless shutdown was already going?
-                    anyhow::bail!("Cannot restart Tenant, already shutting down");
+                    // this really should not happen, at all, unless a shutdown without acquiring
+                    // tenant slot was already going? regardless, on restart the attempt tracking
+                    // will reset to retryable.
+                    return Err(Error::ShuttingDown);
                 }
             }
 
             let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)
+                .map_err(|e| Error::DetachReparent(e.into()))?;
 
             let shard_identity = config.shard;
             let tenant = tenant_spawn(
@@ -2009,12 +2002,13 @@ impl TenantManager {
                 tenant_shard_id,
                 &tenant_path,
                 self.resources.clone(),
-                AttachedTenantConf::try_from(config)?,
+                AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?,
                 shard_identity,
                 None,
                 SpawnMode::Eager,
                 ctx,
-            )?;
+            )
+            .map_err(|_| Error::ShuttingDown)?;
 
             {
                 let mut g = tenant.ongoing_timeline_detach.lock().unwrap();
@@ -2025,7 +2019,15 @@ impl TenantManager {
                 *g = Some((attempt.timeline_id, attempt.new_barrier()));
             }
 
-            slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
+            // if we bail out here, we will not allow a new attempt, which should be fine.
+            // pageserver should be shutting down regardless? tenant_reset would help, unless it
+            // runs into the same problem.
+            slot_guard
+                .upsert(TenantSlot::Attached(tenant.clone()))
+                .map_err(|e| match e {
+                    TenantSlotUpsertError::ShuttingDown(_) => Error::ShuttingDown,
+                    other => Error::DetachReparent(other.into()),
+                })?;
             tenant
         } else {
             tracing::info!("skipping tenant_reset as no changes made required it");
@@ -2047,7 +2049,7 @@ impl TenantManager {
                         Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => {
                             Error::ShuttingDown
                         }
-                        other => Error::Unexpected(other.into()),
+                        other => Error::Complete(other.into()),
                     }
                 })?;
 
@@ -2057,19 +2059,16 @@ impl TenantManager {
 
             let timeline = tenant
                 .get_timeline(attempt.timeline_id, true)
-                .map_err(|_| Error::DetachedNotFoundAfterRestart)?;
+                .map_err(Error::NotFound)?;
 
             timeline
                 .complete_detaching_timeline_ancestor(&tenant, attempt, ctx)
                 .await
                 .map(|()| reparented)
-                .map_err(|e| e.into())
         } else {
             // at least the latest versions have now been downloaded and refreshed; be ready to
             // retry another time.
-            Err(anyhow::anyhow!(
-                "failed to reparent all candidate timelines, please retry"
-            ))
+            Err(Error::FailedToReparentAll)
         }
     }
 
@@ -2392,6 +2391,9 @@ impl SlotGuard {
 
     /// Get any value that was present in the slot before we acquired ownership
     /// of it: in state transitions, this will be the old state.
+    ///
+    // FIXME: get_ prefix
+    // FIXME: this should be .as_ref() -- unsure why no clippy
     fn get_old_value(&self) -> &Option<TenantSlot> {
         &self.old_value
     }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 0175f32268..9c31d5dc3f 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1612,6 +1612,12 @@ pub(crate) enum DownloadError {
     Failpoint(failpoints::FailpointKind),
 }
 
+impl DownloadError {
+    pub(crate) fn is_cancelled(&self) -> bool {
+        matches!(self, DownloadError::DownloadCancelled)
+    }
+}
+
 #[derive(Debug, PartialEq)]
 pub(crate) enum NeedsDownload {
     NotFound,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f1587951c6..c45d7431ec 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4342,7 +4342,7 @@ impl Timeline {
         tenant: &crate::tenant::Tenant,
         prepared: detach_ancestor::PreparedTimelineDetach,
         ctx: &RequestContext,
-    ) -> Result<detach_ancestor::DetachingAndReparenting, anyhow::Error> {
+    ) -> Result<detach_ancestor::DetachingAndReparenting, detach_ancestor::Error> {
         detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await
     }
 
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 969da2662b..641faada25 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -5,7 +5,6 @@ use crate::{
     context::{DownloadBehavior, RequestContext},
     task_mgr::TaskKind,
     tenant::{
-        mgr::GetActiveTenantError,
         remote_timeline_client::index::GcBlockingReason::DetachAncestor,
         storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
         Tenant,
@@ -23,61 +22,74 @@ use utils::{completion, generation::Generation, http::error::ApiError, id::Timel
 pub(crate) enum Error {
     #[error("no ancestors")]
     NoAncestor,
+
     #[error("too many ancestors")]
     TooManyAncestors,
+
     #[error("shutting down, please retry later")]
     ShuttingDown,
-    #[error("flushing failed")]
-    FlushAncestor(#[source] FlushLayerError),
-    #[error("layer download failed")]
-    RewrittenDeltaDownloadFailed(#[source] crate::tenant::storage_layer::layer::DownloadError),
-    #[error("copying LSN prefix locally failed")]
-    CopyDeltaPrefix(#[source] anyhow::Error),
-    #[error("upload rewritten layer")]
-    UploadRewritten(#[source] anyhow::Error),
+
+    #[error(transparent)]
+    NotFound(crate::tenant::GetTimelineError),
+
+    #[error("failed to reparent all candidate timelines, please retry")]
+    FailedToReparentAll,
 
     #[error("ancestor is already being detached by: {}", .0)]
     OtherTimelineDetachOngoing(TimelineId),
 
-    #[error("remote copying layer failed")]
-    CopyFailed(#[source] anyhow::Error),
+    #[error("preparing to timeline ancestor detach failed")]
+    Prepare(#[source] anyhow::Error),
 
-    #[error("wait for tenant to activate after restarting")]
-    WaitToActivate(#[source] GetActiveTenantError),
+    #[error("detaching and reparenting failed")]
+    DetachReparent(#[source] anyhow::Error),
 
-    #[error("detached timeline was not found after restart")]
-    DetachedNotFoundAfterRestart,
-
-    #[error("unexpected error")]
-    Unexpected(#[source] anyhow::Error),
+    #[error("completing ancestor detach failed")]
+    Complete(#[source] anyhow::Error),
 
     #[error("failpoint: {}", .0)]
     Failpoint(&'static str),
 }
 
+impl Error {
+    /// Try to catch cancellation from within the `anyhow::Error`, or wrap the anyhow as the given
+    /// variant or fancier `or_else`.
+    fn launder<F>(e: anyhow::Error, or_else: F) -> Error
+    where
+        F: Fn(anyhow::Error) -> Error,
+    {
+        use crate::tenant::remote_timeline_client::WaitCompletionError;
+        use crate::tenant::upload_queue::NotInitialized;
+        use remote_storage::TimeoutOrCancel;
+
+        if e.is::<NotInitialized>()
+            || TimeoutOrCancel::caused_by_cancel(&e)
+            || e.downcast_ref::<remote_storage::DownloadError>()
+                .is_some_and(|e| e.is_cancelled())
+            || e.is::<WaitCompletionError>()
+        {
+            Error::ShuttingDown
+        } else {
+            or_else(e)
+        }
+    }
+}
+
 impl From<Error> for ApiError {
     fn from(value: Error) -> Self {
         match value {
-            e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
-            // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
-            e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
+            Error::NoAncestor => ApiError::Conflict(value.to_string()),
+            Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)),
             Error::ShuttingDown => ApiError::ShuttingDown,
-            Error::OtherTimelineDetachOngoing(_) => {
-                ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
+            Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => {
+                ApiError::ResourceUnavailable(value.to_string().into())
             }
-            e @ Error::WaitToActivate(_) => {
-                let s = utils::error::report_compact_sources(&e).to_string();
-                ApiError::ResourceUnavailable(s.into())
-            }
-            // All of these contain shutdown errors, in fact, it's the most common
-            e @ Error::FlushAncestor(_)
-            | e @ Error::RewrittenDeltaDownloadFailed(_)
-            | e @ Error::CopyDeltaPrefix(_)
-            | e @ Error::UploadRewritten(_)
-            | e @ Error::CopyFailed(_)
-            | e @ Error::Unexpected(_)
-            | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
-            Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()),
+            Error::NotFound(e) => ApiError::from(e),
+            // these variants should have no cancellation errors because of Error::launder
+            Error::Prepare(_)
+            | Error::DetachReparent(_)
+            | Error::Complete(_)
+            | Error::Failpoint(_) => ApiError::InternalServerError(value.into()),
         }
     }
 }
@@ -95,39 +107,6 @@ impl From<super::layer_manager::Shutdown> for Error {
     }
 }
 
-impl From<FlushLayerError> for Error {
-    fn from(value: FlushLayerError) -> Self {
-        match value {
-            FlushLayerError::Cancelled => Error::ShuttingDown,
-            FlushLayerError::NotRunning(_) => {
-                // FIXME(#6424): technically statically unreachable right now, given how we never
-                // drop the sender
-                Error::ShuttingDown
-            }
-            FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => {
-                Error::FlushAncestor(value)
-            }
-        }
-    }
-}
-
-impl From<GetActiveTenantError> for Error {
-    fn from(value: GetActiveTenantError) -> Self {
-        use pageserver_api::models::TenantState;
-        use GetActiveTenantError::*;
-
-        match value {
-            Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => {
-                Error::ShuttingDown
-            }
-            WaitForActiveTimeout { .. } | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => {
-                // NotFound seems out-of-place
-                Error::WaitToActivate(value)
-            }
-        }
-    }
-}
-
 pub(crate) enum Progress {
     Prepared(Attempt, PreparedTimelineDetach),
     Done(AncestorDetached),
@@ -236,7 +215,7 @@ pub(super) async fn prepare(
 
     let attempt = start_new_attempt(detached, tenant).await?;
 
-    utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
+    utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable");
 
     fail::fail_point!(
         "timeline-detach-ancestor::before_starting_after_locking",
@@ -265,7 +244,17 @@ pub(super) async fn prepare(
                 }
             };
 
-            res?;
+            res.map_err(|e| {
+                use FlushLayerError::*;
+                match e {
+                    Cancelled | NotRunning(_) => {
+                        // FIXME(#6424): technically statically unreachable right now, given how we never
+                        // drop the sender
+                        Error::ShuttingDown
+                    }
+                    CreateImageLayersError(_) | Other(_) => Error::Prepare(e.into()),
+                }
+            })?;
 
             // we do not need to wait for uploads to complete but we do need `struct Layer`,
             // copying delta prefix is unsupported currently for `InMemoryLayer`.
@@ -346,7 +335,7 @@ pub(super) async fn prepare(
                 }
                 Ok(Ok(None)) => {}
                 Ok(Err(e)) => return Err(e),
-                Err(je) => return Err(Unexpected(je.into())),
+                Err(je) => return Err(Error::Prepare(je.into())),
             }
         }
 
@@ -394,7 +383,7 @@ pub(super) async fn prepare(
             Ok(Err(failed)) => {
                 return Err(failed);
             }
-            Err(je) => return Err(Unexpected(je.into())),
+            Err(je) => return Err(Error::Prepare(je.into())),
         }
     }
 
@@ -416,8 +405,7 @@ async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attem
             crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
         )
         .await
-        // FIXME: better error
-        .map_err(Error::Unexpected)?;
+        .map_err(|e| Error::launder(e, Error::Prepare))?;
 
     Ok(attempt)
 }
@@ -546,19 +534,17 @@ async fn upload_rewritten_layer(
     cancel: &CancellationToken,
     ctx: &RequestContext,
 ) -> Result<Option<Layer>, Error> {
-    use Error::UploadRewritten;
     let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?;
 
     let Some(copied) = copied else {
         return Ok(None);
     };
 
-    // FIXME: better shuttingdown error
     target
         .remote_client
         .upload_layer_file(&copied, cancel)
         .await
-        .map_err(UploadRewritten)?;
+        .map_err(|e| Error::launder(e, Error::Prepare))?;
 
     Ok(Some(copied.into()))
 }
@@ -569,10 +555,8 @@ async fn copy_lsn_prefix(
     target_timeline: &Arc<Timeline>,
     ctx: &RequestContext,
 ) -> Result<Option<ResidentLayer>, Error> {
-    use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown};
-
     if target_timeline.cancel.is_cancelled() {
-        return Err(ShuttingDown);
+        return Err(Error::ShuttingDown);
     }
 
     tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
@@ -586,18 +570,22 @@ async fn copy_lsn_prefix(
         ctx,
     )
     .await
-    .map_err(CopyDeltaPrefix)?;
+    .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}"))
+    .map_err(Error::Prepare)?;
 
-    let resident = layer
-        .download_and_keep_resident()
-        .await
-        // likely shutdown
-        .map_err(RewrittenDeltaDownloadFailed)?;
+    let resident = layer.download_and_keep_resident().await.map_err(|e| {
+        if e.is_cancelled() {
+            Error::ShuttingDown
+        } else {
+            Error::Prepare(e.into())
+        }
+    })?;
 
     let records = resident
         .copy_delta_prefix(&mut writer, end_lsn, ctx)
         .await
-        .map_err(CopyDeltaPrefix)?;
+        .with_context(|| format!("copy lsn prefix of ancestors {layer}"))
+        .map_err(Error::Prepare)?;
 
     drop(resident);
 
@@ -615,9 +603,9 @@ async fn copy_lsn_prefix(
         let (desc, path) = writer
             .finish(reused_highest_key, ctx)
             .await
-            .map_err(CopyDeltaPrefix)?;
+            .map_err(Error::Prepare)?;
         let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
-            .map_err(CopyDeltaPrefix)?;
+            .map_err(Error::Prepare)?;
 
         tracing::debug!(%layer, %copied, "new layer produced");
 
@@ -633,8 +621,6 @@ async fn remote_copy(
     generation: Generation,
     cancel: &CancellationToken,
 ) -> Result<Layer, Error> {
-    use Error::CopyFailed;
-
     // depending if Layer::keep_resident we could hardlink
 
     let mut metadata = adopted.metadata();
@@ -648,13 +634,12 @@ async fn remote_copy(
         metadata,
     );
 
-    // FIXME: better shuttingdown error
     adoptee
         .remote_client
         .copy_timeline_layer(adopted, &owned, cancel)
         .await
         .map(move |()| owned)
-        .map_err(CopyFailed)
+        .map_err(|e| Error::launder(e, Error::Prepare))
 }
 
 pub(crate) enum DetachingAndReparenting {
@@ -698,7 +683,7 @@ pub(super) async fn detach_and_reparent(
     tenant: &Tenant,
     prepared: PreparedTimelineDetach,
     _ctx: &RequestContext,
-) -> Result<DetachingAndReparenting, anyhow::Error> {
+) -> Result<DetachingAndReparenting, Error> {
     let PreparedTimelineDetach { layers } = prepared;
 
     #[derive(Debug)]
@@ -783,7 +768,8 @@ pub(super) async fn detach_and_reparent(
                     (ancestor.timeline_id, ancestor_lsn),
                 )
                 .await
-                .context("publish layers and detach ancestor")?;
+                .context("publish layers and detach ancestor")
+                .map_err(|e| Error::launder(e, Error::DetachReparent))?;
 
             tracing::info!(
                 ancestor=%ancestor.timeline_id,
@@ -927,8 +913,7 @@ pub(super) async fn complete(
             crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
         )
         .await
-        // FIXME: better error
-        .map_err(Error::Unexpected)?;
+        .map_err(|e| Error::launder(e, Error::Complete))?;
 
     Ok(())
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ee8e9ac5a1..ef4cd91efd 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3006,8 +3006,13 @@ impl Service {
                         Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
                             ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
                         }
+                        Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => {
+                            // avoid turning these into conflicts to remain compatible with
+                            // pageservers, 500 errors are sadly retryable with timeline ancestor
+                            // detach
+                            ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}"))
+                        }
                         // rest can be mapped as usual
-                        // FIXME: this converts some 500 to 409 which is not per openapi
                         other => passthrough_api_error(&node, other),
                     }
                 })
@@ -3041,6 +3046,8 @@ impl Service {
                 ?mismatching,
                 "shards returned different results"
             );
+
+            return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required.")));
         }
 
         Ok(any.1)
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 902457c2ac..82fc26126d 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -97,7 +97,7 @@ def test_ancestor_detach_branched_from(
         client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
 
         ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);")
-        wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+        flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline)
 
     deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers()
     # there is also the in-mem layer, but ignore it for now
@@ -452,6 +452,9 @@ def test_compaction_induced_by_detaches_in_history(
         }
     )
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+    env.pageserver.allowed_errors.append(
+        ".*await_initial_logical_size: can't get semaphore cancel token, skipping"
+    )
     client = env.pageserver.http_client()
 
     def delta_layers(timeline_id: TimelineId):
@@ -524,6 +527,7 @@ def test_compaction_induced_by_detaches_in_history(
     assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1
 
     skip_main = branches[1:]
+
     branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"]
 
     # take the fullbackup before and after inheriting the new L0s
@@ -532,6 +536,13 @@ def test_compaction_induced_by_detaches_in_history(
         env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before
     )
 
+    # force initial logical sizes, so we can evict all layers from all
+    # timelines and exercise on-demand download for copy lsn prefix
+    client.timeline_detail(
+        env.initial_tenant, env.initial_timeline, force_await_initial_logical_size=True
+    )
+    client.evict_all_layers(env.initial_tenant, env.initial_timeline)
+
     for _, timeline_id in skip_main:
         reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
         assert reparented == set(), "we have no earlier branches at any level"
@@ -705,7 +716,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
     log.info(f"stuck pageserver is id={stuck.id}")
     stuck_http = stuck.http_client()
     stuck_http.configure_failpoints(
-        ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause")
+        ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause")
     )
 
     restarted = pageservers[int(shards[1]["node_id"])]
@@ -716,7 +727,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
     restarted_http = restarted.http_client()
     restarted_http.configure_failpoints(
         [
-            ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause"),
+            ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause"),
         ]
     )
 
@@ -734,7 +745,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
         target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1)
 
     stuck_http.configure_failpoints(
-        ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off")
+        ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off")
     )
 
     barrier = threading.Barrier(2)
@@ -753,7 +764,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
         # we have 10s, lets use 1/2 of that to help the shutdown start
         time.sleep(5)
         restarted_http.configure_failpoints(
-            ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off")
+            ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off")
         )
         fut.result()
 
@@ -806,7 +817,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
     after starting the detach.
 
     What remains not tested by this:
-    - shutdown winning over complete
+    - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes
     """
 
     if sharded and mode == "delete_tenant":
@@ -833,7 +844,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
 
     detached_timeline = env.neon_cli.create_branch("detached soon", "main")
 
-    pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable"
+    pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
 
     env.storage_controller.reconcile_until_idle()
     shards = env.storage_controller.locate(env.initial_tenant)
@@ -931,7 +942,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
     _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
 
 
-@pytest.mark.parametrize("mode", ["delete_reparentable_timeline"])
+@pytest.mark.parametrize("mode", ["delete_reparentable_timeline", "create_reparentable_timeline"])
 def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str):
     """
     Technically possible storage controller concurrent interleaving timeline
@@ -943,12 +954,6 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
     must be detached.
     """
 
-    assert (
-        mode == "delete_reparentable_timeline"
-    ), "only one now, but creating reparentable timelines cannot be supported even with gc blocking"
-    # perhaps it could be supported by always doing this for the shard0 first, and after that for others.
-    # when we run shard0 to completion, we can use it's timelines to restrict which can be reparented.
-
     shard_count = 2
     neon_env_builder.num_pageservers = shard_count
     env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
@@ -980,14 +985,21 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
     for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]:
         ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline)
 
-    first_branch = env.neon_cli.create_branch(
-        "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn
-    )
+    def create_reparentable_timeline() -> TimelineId:
+        return env.neon_cli.create_branch(
+            "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn
+        )
+
+    if mode == "delete_reparentable_timeline":
+        first_branch = create_reparentable_timeline()
+    else:
+        first_branch = None
+
     detached_branch = env.neon_cli.create_branch(
         "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn
     )
 
-    pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable"
+    pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
 
     stuck = pageservers[int(shards[0]["node_id"])]
     stuck_http = stuck.http_client().without_status_retrying()
@@ -999,12 +1011,6 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
         (pausepoint, "pause"),
     )
 
-    # noticed a surprising 409 if the other one would fail instead
-    # victim_http.configure_failpoints([
-    #     (pausepoint, "pause"),
-    #     ("timeline-detach-ancestor::before_starting_after_locking", "return"),
-    # ])
-
     # interleaving a create_timeline which could be reparented will produce two
     # permanently different reparentings: one node has reparented, other has
     # not
@@ -1023,6 +1029,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
         assert detail.get("ancestor_lsn") is None
 
     def first_branch_gone():
+        assert first_branch is not None
         try:
             env.storage_controller.pageserver_api().timeline_detail(
                 env.initial_tenant, first_branch
@@ -1043,42 +1050,178 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
             stuck_http.configure_failpoints((pausepoint, "off"))
             wait_until(10, 1.0, first_completed)
 
-            # if we would let victim fail, for some reason there'd be a 409 response instead of 500
-            # victim_http.configure_failpoints((pausepoint, "off"))
-            # with pytest.raises(PageserverApiException, match=".* 500 Internal Server Error failpoint: timeline-detach-ancestor::before_starting_after_locking") as exc:
-            #     fut.result()
-            # assert exc.value.status_code == 409
-
-            env.storage_controller.pageserver_api().timeline_delete(
-                env.initial_tenant, first_branch
-            )
-            victim_http.configure_failpoints((pausepoint, "off"))
-            wait_until(10, 1.0, first_branch_gone)
+            if mode == "delete_reparentable_timeline":
+                assert first_branch is not None
+                env.storage_controller.pageserver_api().timeline_delete(
+                    env.initial_tenant, first_branch
+                )
+                victim_http.configure_failpoints((pausepoint, "off"))
+                wait_until(10, 1.0, first_branch_gone)
+            elif mode == "create_reparentable_timeline":
+                first_branch = create_reparentable_timeline()
+                victim_http.configure_failpoints((pausepoint, "off"))
+            else:
+                raise RuntimeError("{mode}")
 
             # it now passes, and we should get an error messages about mixed reparenting as the stuck still had something to reparent
-            fut.result()
+            mixed_results = "pageservers returned mixed results for ancestor detach; manual intervention is required."
+            with pytest.raises(PageserverApiException, match=mixed_results):
+                fut.result()
 
             msg, offset = env.storage_controller.assert_log_contains(
                 ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*"
             )
-            log.info(f"expected error message: {msg}")
-            env.storage_controller.allowed_errors.append(
-                ".*: shards returned different results matching=0 .*"
+            log.info(f"expected error message: {msg.rstrip()}")
+            env.storage_controller.allowed_errors.extend(
+                [
+                    ".*: shards returned different results matching=0 .*",
+                    f".*: InternalServerError\\({mixed_results}",
+                ]
             )
 
-            detach_timeline()
+            if mode == "create_reparentable_timeline":
+                with pytest.raises(PageserverApiException, match=mixed_results):
+                    detach_timeline()
+            else:
+                # it is a bit shame to flag it and then it suceeds, but most
+                # likely there would be a retry loop which would take care of
+                # this in cplane
+                detach_timeline()
 
-            # FIXME: perhaps the above should be automatically retried, if we get mixed results?
-            not_found = env.storage_controller.log_contains(
+            retried = env.storage_controller.log_contains(
                 ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*",
-                offset=offset,
+                offset,
             )
-
-            assert not_found is None
+            if mode == "delete_reparentable_timeline":
+                assert (
+                    retried is None
+                ), "detaching should had converged after both nodes saw the deletion"
+            elif mode == "create_reparentable_timeline":
+                assert retried is not None, "detaching should not have converged"
+                _, offset = retried
         finally:
             stuck_http.configure_failpoints((pausepoint, "off"))
             victim_http.configure_failpoints((pausepoint, "off"))
 
+    if mode == "create_reparentable_timeline":
+        assert first_branch is not None
+        # now we have mixed ancestry
+        assert (
+            TimelineId(
+                stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)[
+                    "ancestor_timeline_id"
+                ]
+            )
+            == env.initial_timeline
+        )
+        assert (
+            TimelineId(
+                victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[
+                    "ancestor_timeline_id"
+                ]
+            )
+            == detached_branch
+        )
+
+        # make sure we are still able to repair this by detaching the ancestor on the storage controller in case it ever happens
+        # if the ancestor would be deleted, we would partially fail, making deletion stuck.
+        env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, first_branch)
+
+        # and we should now have good results
+        not_found = env.storage_controller.log_contains(
+            ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*",
+            offset,
+        )
+
+        assert not_found is None
+        assert (
+            stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)["ancestor_timeline_id"]
+            is None
+        )
+        assert (
+            victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[
+                "ancestor_timeline_id"
+            ]
+            is None
+        )
+
+
+def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor(
+    neon_env_builder: NeonEnvBuilder,
+):
+    shard_count = 2
+    neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    env.storage_controller.reconcile_until_idle()
+    shards = env.storage_controller.locate(env.initial_tenant)
+    assert len(set(x["node_id"] for x in shards)) == shard_count
+
+    detached_branch = env.neon_cli.create_branch("detached_branch", ancestor_branch_name="main")
+
+    pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
+    failpoint = "timeline-detach-ancestor::before_starting_after_locking"
+
+    stuck = pageservers[int(shards[0]["node_id"])]
+    stuck_http = stuck.http_client().without_status_retrying()
+    stuck_http.configure_failpoints(
+        (pausepoint, "pause"),
+    )
+
+    env.storage_controller.allowed_errors.append(
+        f".*Error processing HTTP request: .* failpoint: {failpoint}"
+    )
+    http = env.storage_controller.pageserver_api()
+
+    victim = pageservers[int(shards[-1]["node_id"])]
+    victim.allowed_errors.append(
+        f".*Error processing HTTP request: InternalServerError\\(failpoint: {failpoint}"
+    )
+    victim_http = victim.http_client().without_status_retrying()
+    victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")])
+
+    def detach_timeline():
+        http.detach_ancestor(env.initial_tenant, detached_branch)
+
+    def paused_at_failpoint():
+        stuck.assert_log_contains(f"at failpoint {pausepoint}")
+        victim.assert_log_contains(f"at failpoint {pausepoint}")
+
+    def first_completed():
+        detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch)
+        log.info(detail)
+        assert detail.get("ancestor_lsn") is None
+
+    with ThreadPoolExecutor(max_workers=1) as pool:
+        try:
+            fut = pool.submit(detach_timeline)
+            wait_until(10, 1.0, paused_at_failpoint)
+
+            # let stuck complete
+            stuck_http.configure_failpoints((pausepoint, "off"))
+            wait_until(10, 1.0, first_completed)
+
+            victim_http.configure_failpoints((pausepoint, "off"))
+
+            with pytest.raises(
+                PageserverApiException,
+                match=f".*failpoint: {failpoint}",
+            ) as exc:
+                fut.result()
+            assert exc.value.status_code == 500
+
+        finally:
+            stuck_http.configure_failpoints((pausepoint, "off"))
+            victim_http.configure_failpoints((pausepoint, "off"))
+
+    victim_http.configure_failpoints((failpoint, "off"))
+    detach_timeline()
+
 
 def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder):
     """
@@ -1169,7 +1312,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon
             match=".*failed to reparent all candidate timelines, please retry",
         ) as exc:
             http.detach_ancestor(env.initial_tenant, detached)
-        assert exc.value.status_code == 500
+        assert exc.value.status_code == 503
 
     # first round -- do more checking to make sure the gc gets paused
     try_detach()
@@ -1323,14 +1466,11 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
 
             http.configure_failpoints((failpoint, "off"))
 
-            with pytest.raises(PageserverApiException) as exc:
+            with pytest.raises(
+                PageserverApiException, match="NotFound: Timeline .* was not found"
+            ) as exc:
                 detach.result()
-
-            # FIXME: this should be 404 but because there is another Anyhow conversion it is 500
-            assert exc.value.status_code == 500
-            env.pageserver.allowed_errors.append(
-                ".*Error processing HTTP request: InternalServerError\\(detached timeline was not found after restart"
-            )
+            assert exc.value.status_code == 404
     finally:
         http.configure_failpoints((failpoint, "off"))
 

From 19d69d515cacc7287517371e3b39fe7f874b306f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 14 Aug 2024 12:10:15 +0100
Subject: [PATCH 35/62] pageserver: evict covered layers earlier (#8679)

## Problem

When pageservers do compaction, they frequently create image layers that
make earlier layers un-needed for reads, but then keep those earlier
layers around for 24 hours waiting for time-based eviction to expire
them.

Now that we track layer visibility, we can use it as an input to
eviction, and avoid the 24 hour "disk bump" that happens around
pageserver restarts.

## Summary of changes

- During time-based eviction, if a layer is marked Covered, use the
eviction period as the threshold: i.e. these layers get to remain
resident for at least one iteration of the eviction loop, but then get
evicted. With current settings this means they get evicted after 1h
instead of 24h.
- During disk usage eviction, prioritized evicting covered layers above
all other layers.


Caveats:
- Using the period as the threshold for time based eviction in this case
is a bit of a hack, but it avoids adding yet another configuration
property, and in any case the value of a new property would be somewhat
arbitrary: there's no "right" length of time to keep covered layers
around just in case.
- We had previously planned on removing time-based eviction: this change
would motivate us to keep it around, but we can still simplify the code
later to just do the eviction of covered layers, rather than applying a
TTL policy to all layers.
---
 pageserver/src/disk_usage_eviction_task.rs    | 54 +++++++++++++------
 pageserver/src/tenant/secondary/downloader.rs |  5 +-
 pageserver/src/tenant/timeline.rs             |  1 +
 .../src/tenant/timeline/eviction_task.rs      | 20 ++++++-
 4 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 92dcf6ee61..5e4a49bc56 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -64,7 +64,7 @@ use crate::{
         mgr::TenantManager,
         remote_timeline_client::LayerFileMetadata,
         secondary::SecondaryTenant,
-        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
+        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
     },
     CancellableTask, DiskUsageEvictionTask,
 };
@@ -114,7 +114,7 @@ fn default_highest_layer_count_loses_first() -> bool {
 }
 
 impl EvictionOrder {
-    fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
+    fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
         use EvictionOrder::*;
 
         match self {
@@ -644,6 +644,7 @@ pub(crate) struct EvictionCandidate {
     pub(crate) layer: EvictionLayer,
     pub(crate) last_activity_ts: SystemTime,
     pub(crate) relative_last_activity: finite_f32::FiniteF32,
+    pub(crate) visibility: LayerVisibilityHint,
 }
 
 impl std::fmt::Display for EvictionLayer {
@@ -685,14 +686,22 @@ impl std::fmt::Debug for EvictionCandidate {
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
-enum MinResidentSizePartition {
+enum EvictionPartition {
+    // A layer that is un-wanted by the tenant: evict all these first, before considering
+    // any other layers
+    EvictNow,
+
+    // Above the minimum size threshold: this layer is a candidate for eviction.
     Above,
+
+    // Below the minimum size threshold: this layer should only be evicted if all the
+    // tenants' layers above the minimum size threshold have already been considered.
     Below,
 }
 
 enum EvictionCandidates {
     Cancelled,
-    Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
+    Finished(Vec<(EvictionPartition, EvictionCandidate)>),
 }
 
 /// Gather the eviction candidates.
@@ -890,8 +899,10 @@ async fn collect_eviction_candidates(
             max_layer_size
         };
 
-        // Sort layers most-recently-used first, then partition by
-        // cumsum above/below min_resident_size.
+        // Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer,
+        // where the inputs are:
+        //  - whether the layer is visible
+        //  - whether the layer is above/below the min_resident_size cutline
         tenant_candidates
             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
         let mut cumsum: i128 = 0;
@@ -908,12 +919,23 @@ async fn collect_eviction_candidates(
                     candidate.relative_last_activity =
                         eviction_order.relative_last_activity(total, i);
 
-                    let partition = if cumsum > min_resident_size as i128 {
-                        MinResidentSizePartition::Above
-                    } else {
-                        MinResidentSizePartition::Below
+                    let partition = match candidate.visibility {
+                        LayerVisibilityHint::Covered => {
+                            // Covered layers are evicted first
+                            EvictionPartition::EvictNow
+                        }
+                        LayerVisibilityHint::Visible => {
+                            cumsum += i128::from(candidate.layer.get_file_size());
+
+                            if cumsum > min_resident_size as i128 {
+                                EvictionPartition::Above
+                            } else {
+                                // The most recent layers below the min_resident_size threshold
+                                // are the last to be evicted.
+                                EvictionPartition::Below
+                            }
+                        }
                     };
-                    cumsum += i128::from(candidate.layer.get_file_size());
 
                     (partition, candidate)
                 });
@@ -981,7 +1003,7 @@ async fn collect_eviction_candidates(
                         // Secondary locations' layers are always considered above the min resident size,
                         // i.e. secondary locations are permitted to be trimmed to zero layers if all
                         // the layers have sufficiently old access times.
-                        MinResidentSizePartition::Above,
+                        EvictionPartition::Above,
                         candidate,
                     )
                 });
@@ -1009,7 +1031,9 @@ async fn collect_eviction_candidates(
         }
     }
 
-    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
+    debug_assert!(EvictionPartition::Above < EvictionPartition::Below,
+        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
+    debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above,
         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
 
     eviction_order.sort(&mut candidates);
@@ -1022,7 +1046,7 @@ async fn collect_eviction_candidates(
 ///
 /// Returns the amount of candidates selected, with the planned usage.
 fn select_victims<U: Usage>(
-    candidates: &[(MinResidentSizePartition, EvictionCandidate)],
+    candidates: &[(EvictionPartition, EvictionCandidate)],
     usage_pre: U,
 ) -> VictimSelection<U> {
     let mut usage_when_switched = None;
@@ -1034,7 +1058,7 @@ fn select_victims<U: Usage>(
             break;
         }
 
-        if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
+        if partition == &EvictionPartition::Below && usage_when_switched.is_none() {
             usage_when_switched = Some((usage_planned, i));
         }
 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 135e73b57f..8cff1d2864 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -22,7 +22,7 @@ use crate::{
             FAILED_REMOTE_OP_RETRIES,
         },
         span::debug_assert_current_span_has_tenant_id,
-        storage_layer::{layer::local_layer_path, LayerName},
+        storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint},
         tasks::{warn_when_period_overrun, BackgroundLoopKind},
     },
     virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
@@ -296,6 +296,9 @@ impl SecondaryDetail {
                         }),
                         last_activity_ts: ods.access_time,
                         relative_last_activity: finite_f32::FiniteF32::ZERO,
+                        // Secondary location layers are presumed visible, because Covered layers
+                        // are excluded from the heatmap
+                        visibility: LayerVisibilityHint::Visible,
                     }
                 }));
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c45d7431ec..a799ce764a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5261,6 +5261,7 @@ impl Timeline {
                     layer: layer.to_owned().into(),
                     last_activity_ts,
                     relative_last_activity: finite_f32::FiniteF32::ZERO,
+                    visibility: layer.visibility(),
                 }
             })
             .collect();
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 07d860eb80..eaa9c0ff62 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,8 @@ use crate::{
     pgdatadir_mapping::CollectKeySpaceError,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
-        tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
+        storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
+        LogicalSizeCalculationCause, Tenant,
     },
 };
 
@@ -241,7 +242,22 @@ impl Timeline {
                         }
                     };
 
-                    no_activity_for > p.threshold
+                    match layer.visibility() {
+                        LayerVisibilityHint::Visible => {
+                            // Usual case: a visible layer might be read any time, and we will keep it
+                            // resident until it hits our configured TTL threshold.
+                            no_activity_for > p.threshold
+                        }
+                        LayerVisibilityHint::Covered => {
+                            // Covered layers: this is probably a layer that was recently covered by
+                            // an image layer during compaction.  We don't evict it immediately, but
+                            // it doesn't stay resident for the full `threshold`: we just keep it
+                            // for a shorter time in case
+                            // - it is used for Timestamp->LSN lookups
+                            // - a new branch is created in recent history which will read this layer
+                            no_activity_for > p.period
+                        }
+                    }
                 })
                 .cloned()
                 .for_each(|layer| {

From fc3d372f3ab52ee3d4b9df5fc047c1ab3b5e26b1 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 14 Aug 2024 12:27:52 +0100
Subject: [PATCH 36/62] CI(label-for-external-users): check membership using
 GitHub API (#8724)

## Problem

`author_association` doesn't properly work if a GitHub user decides not
to show affiliation with the org in their profile (i.e. if it's private)

## Summary of changes
- Call
`/orgs/ORG/members/USERNAME` API to check whether
a PR/issue author is a member of the org
---
 .../workflows/label-for-external-users.yml    | 29 +++++++++++++++----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
index 2f19a746e0..7cf5ee254c 100644
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -15,15 +15,34 @@ env:
   LABEL: external
 
 jobs:
+  check-user:
+    runs-on: ubuntu-22.04
+
+    outputs:
+      is-member: ${{ steps.check-user.outputs.is-member }}
+
+    steps:
+    - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
+      id: check-user
+      env:
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      run: |
+        if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
+          is_member=true
+        else
+          is_member=false
+        fi
+
+        echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
+
   add-label:
-    # This workflow uses `author_association` for PRs and issues to determine if the user is an external user.
-    # Possible values for `author_association`: https://docs.github.com/en/graphql/reference/enums#commentauthorassociation
-    if: ${{ !contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].author_association) }}
+    if: needs.check-user.outputs.is-member == 'false'
+    needs: [ check-user ]
 
     runs-on: ubuntu-22.04
     permissions:
-      pull-requests: write
-      issues: write
+      pull-requests: write # for `gh pr edit`
+      issues: write        # for `gh issue edit`
 
     steps:
     - name: Label new ${{ github.event_name }}

From 6c9e3c95518306c45b5290c10dec5c0a53aaab2d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 14 Aug 2024 14:45:56 +0300
Subject: [PATCH 37/62] refactor: error/anyhow::Error wrapping (#8697)

We can get CompactionError::Other(Cancelled) via the error handling with
a few ways.
[evidence](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8655/10301613380/index.html#suites/cae012a1e6acdd9fdd8b81541972b6ce/653a33de17802bb1/).
Hopefully fix it by:

1. replace the `map_err` which hid the
`GetReadyAncestorError::Cancelled` with `From<GetReadyAncestorError> for
GetVectoredError` conversion
2. simplifying the code in pgdatadir_mapping to eliminate the token
anyhow wrapping for deserialization errors
3. stop wrapping GetVectoredError as anyhow errors
4. stop wrapping PageReconstructError as anyhow errors

Additionally, produce warnings if we treat any other error (as was legal
before this PR) as missing key.

Cc: #8708.
---
 libs/postgres_ffi/src/lib.rs                  |  4 +-
 pageserver/src/http/routes.rs                 |  6 +-
 pageserver/src/pgdatadir_mapping.rs           | 90 +++++++------------
 pageserver/src/tenant.rs                      | 11 ++-
 .../src/tenant/storage_layer/delta_layer.rs   |  4 +-
 pageserver/src/tenant/storage_layer/layer.rs  |  4 +-
 pageserver/src/tenant/timeline.rs             | 71 ++++++++++-----
 pageserver/src/tenant/timeline/compaction.rs  |  2 +-
 pageserver/src/walingest.rs                   |  4 +-
 pageserver/src/walrecord.rs                   |  2 +-
 10 files changed, 101 insertions(+), 97 deletions(-)

diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 729f57f829..0940ad207f 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -143,8 +143,8 @@ pub use v14::xlog_utils::XLogFileName;
 
 pub use v14::bindings::DBState_DB_SHUTDOWNED;
 
-pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
-    dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
+pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool {
+    dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info))
 }
 
 pub fn generate_wal_segment(
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index d209f4eced..a4da8506d6 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -178,10 +178,8 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
 impl From<PageReconstructError> for ApiError {
     fn from(pre: PageReconstructError) -> ApiError {
         match pre {
-            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
-            PageReconstructError::MissingKey(e) => {
-                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
-            }
+            PageReconstructError::Other(other) => ApiError::InternalServerError(other),
+            PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()),
             PageReconstructError::Cancelled => ApiError::Cancelled,
             PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
             PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 85f3a6e0fb..4f7eb1a00c 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -287,10 +287,7 @@ impl Timeline {
         // then check if the database was already initialized.
         // get_rel_exists can be called before dbdir is created.
         let buf = version.get(self, DBDIR_KEY, ctx).await?;
-        let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => Ok(dir.dbdirs),
-            Err(e) => Err(PageReconstructError::from(e)),
-        }?;
+        let dbdirs = DbDirectory::des(&buf)?.dbdirs;
         if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
             return Ok(false);
         }
@@ -298,13 +295,8 @@ impl Timeline {
         let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
         let buf = version.get(self, key, ctx).await?;
 
-        match RelDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => {
-                let exists = dir.rels.contains(&(tag.relnode, tag.forknum));
-                Ok(exists)
-            }
-            Err(e) => Err(PageReconstructError::from(e)),
-        }
+        let dir = RelDirectory::des(&buf)?;
+        Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
     }
 
     /// Get a list of all existing relations in given tablespace and database.
@@ -323,20 +315,16 @@ impl Timeline {
         let key = rel_dir_to_key(spcnode, dbnode);
         let buf = version.get(self, key, ctx).await?;
 
-        match RelDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => {
-                let rels: HashSet<RelTag> =
-                    HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
-                        spcnode,
-                        dbnode,
-                        relnode: *relnode,
-                        forknum: *forknum,
-                    }));
+        let dir = RelDirectory::des(&buf)?;
+        let rels: HashSet<RelTag> =
+            HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
+                spcnode,
+                dbnode,
+                relnode: *relnode,
+                forknum: *forknum,
+            }));
 
-                Ok(rels)
-            }
-            Err(e) => Err(PageReconstructError::from(e)),
-        }
+        Ok(rels)
     }
 
     /// Get the whole SLRU segment
@@ -398,13 +386,8 @@ impl Timeline {
         let key = slru_dir_to_key(kind);
         let buf = version.get(self, key, ctx).await?;
 
-        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => {
-                let exists = dir.segments.contains(&segno);
-                Ok(exists)
-            }
-            Err(e) => Err(PageReconstructError::from(e)),
-        }
+        let dir = SlruSegmentDirectory::des(&buf)?;
+        Ok(dir.segments.contains(&segno))
     }
 
     /// Locate LSN, such that all transactions that committed before
@@ -620,10 +603,7 @@ impl Timeline {
         let key = slru_dir_to_key(kind);
 
         let buf = version.get(self, key, ctx).await?;
-        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => Ok(dir.segments),
-            Err(e) => Err(PageReconstructError::from(e)),
-        }
+        Ok(SlruSegmentDirectory::des(&buf)?.segments)
     }
 
     pub(crate) async fn get_relmap_file(
@@ -647,10 +627,7 @@ impl Timeline {
         // fetch directory entry
         let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
 
-        match DbDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => Ok(dir.dbdirs),
-            Err(e) => Err(PageReconstructError::from(e)),
-        }
+        Ok(DbDirectory::des(&buf)?.dbdirs)
     }
 
     pub(crate) async fn get_twophase_file(
@@ -672,10 +649,7 @@ impl Timeline {
         // fetch directory entry
         let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
 
-        match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => Ok(dir.xids),
-            Err(e) => Err(PageReconstructError::from(e)),
-        }
+        Ok(TwoPhaseDirectory::des(&buf)?.xids)
     }
 
     pub(crate) async fn get_control_file(
@@ -700,10 +674,7 @@ impl Timeline {
         ctx: &RequestContext,
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
         match self.get(AUX_FILES_KEY, lsn, ctx).await {
-            Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
-                Ok(dir) => Ok(dir.files),
-                Err(e) => Err(PageReconstructError::from(e)),
-            },
+            Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files),
             Err(e) => {
                 // This is expected: historical databases do not have the key.
                 debug!("Failed to get info about AUX files: {}", e);
@@ -719,13 +690,14 @@ impl Timeline {
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
         let kv = self
             .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
-            .await
-            .context("scan")?;
+            .await?;
         let mut result = HashMap::new();
         let mut sz = 0;
         for (_, v) in kv {
-            let v = v.context("get value")?;
-            let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
+            let v = v?;
+            let v = aux_file::decode_file_value_bytes(&v)
+                .context("value decode")
+                .map_err(PageReconstructError::Other)?;
             for (fname, content) in v {
                 sz += fname.len();
                 sz += content.len();
@@ -793,11 +765,10 @@ impl Timeline {
     ) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
         let kv = self
             .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
-            .await
-            .context("scan")?;
+            .await?;
         let mut result = HashMap::new();
         for (k, v) in kv {
-            let v = v.context("get value")?;
+            let v = v?;
             let origin_id = k.field6 as RepOriginId;
             let origin_lsn = Lsn::des(&v).unwrap();
             if origin_lsn != Lsn::INVALID {
@@ -1733,12 +1704,17 @@ impl<'a> DatadirModification<'a> {
                     // the original code assumes all other errors are missing keys. Therefore, we keep the code path
                     // the same for now, though in theory, we should only match the `MissingKey` variant.
                     Err(
-                        PageReconstructError::Other(_)
+                        e @ (PageReconstructError::Other(_)
                         | PageReconstructError::WalRedo(_)
-                        | PageReconstructError::MissingKey { .. },
+                        | PageReconstructError::MissingKey(_)),
                     ) => {
                         // Key is missing, we must insert an image as the basis for subsequent deltas.
 
+                        if !matches!(e, PageReconstructError::MissingKey(_)) {
+                            let e = utils::error::report_compact_sources(&e);
+                            tracing::warn!("treating error as if it was a missing key: {}", e);
+                        }
+
                         let mut dir = AuxFilesDirectory {
                             files: HashMap::new(),
                         };
@@ -1893,7 +1869,7 @@ impl<'a> DatadirModification<'a> {
                     // work directly with Images, and we never need to read actual
                     // data pages. We could handle this if we had to, by calling
                     // the walredo manager, but let's keep it simple for now.
-                    Err(PageReconstructError::from(anyhow::anyhow!(
+                    Err(PageReconstructError::Other(anyhow::anyhow!(
                         "unexpected pending WAL record"
                     )))
                 };
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index b065f58382..8ab8d08ce1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4491,10 +4491,13 @@ mod tests {
 
         // This needs to traverse to the parent, and fails.
         let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
-        assert!(err.to_string().starts_with(&format!(
-            "Bad state on timeline {}: Broken",
-            tline.timeline_id
-        )));
+        assert!(
+            err.to_string().starts_with(&format!(
+                "bad state on timeline {}: Broken",
+                tline.timeline_id
+            )),
+            "{err}"
+        );
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index f4e965b99a..0ed2f72c3f 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1020,7 +1020,7 @@ impl DeltaLayerInner {
                     for (_, blob_meta) in read.blobs_at.as_slice() {
                         reconstruct_state.on_key_error(
                             blob_meta.key,
-                            PageReconstructError::from(anyhow!(
+                            PageReconstructError::Other(anyhow!(
                                 "Failed to read blobs from virtual file {}: {}",
                                 self.file.path,
                                 kind
@@ -1047,7 +1047,7 @@ impl DeltaLayerInner {
                     Err(e) => {
                         reconstruct_state.on_key_error(
                             meta.meta.key,
-                            PageReconstructError::from(anyhow!(e).context(format!(
+                            PageReconstructError::Other(anyhow!(e).context(format!(
                                 "Failed to deserialize blob from virtual file {}",
                                 self.file.path,
                             ))),
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 9c31d5dc3f..774f97e1d9 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -312,7 +312,9 @@ impl Layer {
             .get_or_maybe_download(true, Some(ctx))
             .await
             .map_err(|err| match err {
-                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
+                DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
+                    GetVectoredError::Cancelled
+                }
                 other => GetVectoredError::Other(anyhow::anyhow!(other)),
             })?;
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a799ce764a..d437724673 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -511,7 +511,7 @@ pub(crate) struct TimelineVisitOutcome {
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum PageReconstructError {
     #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    Other(anyhow::Error),
 
     #[error("Ancestor LSN wait error: {0}")]
     AncestorLsnTimeout(WaitLsnError),
@@ -527,6 +527,22 @@ pub(crate) enum PageReconstructError {
     MissingKey(MissingKeyError),
 }
 
+impl From<anyhow::Error> for PageReconstructError {
+    fn from(value: anyhow::Error) -> Self {
+        // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error
+        match value.downcast::<PageReconstructError>() {
+            Ok(pre) => pre,
+            Err(other) => PageReconstructError::Other(other),
+        }
+    }
+}
+
+impl From<utils::bin_ser::DeserializeError> for PageReconstructError {
+    fn from(value: utils::bin_ser::DeserializeError) -> Self {
+        PageReconstructError::Other(anyhow::Error::new(value).context("deserialization failure"))
+    }
+}
+
 impl From<layer_manager::Shutdown> for PageReconstructError {
     fn from(_: layer_manager::Shutdown) -> Self {
         PageReconstructError::Cancelled
@@ -546,6 +562,7 @@ impl From<layer_manager::Shutdown> for GetVectoredError {
     }
 }
 
+#[derive(thiserror::Error)]
 pub struct MissingKeyError {
     key: Key,
     shard: ShardNumber,
@@ -585,11 +602,8 @@ impl PageReconstructError {
     pub(crate) fn is_stopping(&self) -> bool {
         use PageReconstructError::*;
         match self {
-            Other(_) => false,
-            AncestorLsnTimeout(_) => false,
             Cancelled => true,
-            WalRedo(_) => false,
-            MissingKey { .. } => false,
+            Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false,
         }
     }
 }
@@ -599,11 +613,11 @@ pub(crate) enum CreateImageLayersError {
     #[error("timeline shutting down")]
     Cancelled,
 
-    #[error(transparent)]
-    GetVectoredError(GetVectoredError),
+    #[error("read failed")]
+    GetVectoredError(#[source] GetVectoredError),
 
-    #[error(transparent)]
-    PageReconstructError(PageReconstructError),
+    #[error("reconstruction failed")]
+    PageReconstructError(#[source] PageReconstructError),
 
     #[error(transparent)]
     Other(#[from] anyhow::Error),
@@ -627,10 +641,10 @@ pub(crate) enum FlushLayerError {
 
     // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush
     // loop via a watch channel, where we can only borrow it.
-    #[error(transparent)]
+    #[error("create image layers (shared)")]
     CreateImageLayersError(Arc<CreateImageLayersError>),
 
-    #[error(transparent)]
+    #[error("other (shared)")]
     Other(#[from] Arc<anyhow::Error>),
 }
 
@@ -663,34 +677,46 @@ pub(crate) enum GetVectoredError {
     #[error("timeline shutting down")]
     Cancelled,
 
-    #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
+    #[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
     Oversized(u64),
 
-    #[error("Requested at invalid LSN: {0}")]
+    #[error("requested at invalid LSN: {0}")]
     InvalidLsn(Lsn),
 
-    #[error("Requested key not found: {0}")]
+    #[error("requested key not found: {0}")]
     MissingKey(MissingKeyError),
 
-    #[error(transparent)]
-    GetReadyAncestorError(GetReadyAncestorError),
+    #[error("ancestry walk")]
+    GetReadyAncestorError(#[source] GetReadyAncestorError),
 
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
 
+impl From<GetReadyAncestorError> for GetVectoredError {
+    fn from(value: GetReadyAncestorError) -> Self {
+        use GetReadyAncestorError::*;
+        match value {
+            Cancelled => GetVectoredError::Cancelled,
+            AncestorLsnTimeout(_) | BadState { .. } => {
+                GetVectoredError::GetReadyAncestorError(value)
+            }
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum GetReadyAncestorError {
-    #[error("Ancestor LSN wait error: {0}")]
+    #[error("ancestor LSN wait error")]
     AncestorLsnTimeout(#[from] WaitLsnError),
 
-    #[error("Bad state on timeline {timeline_id}: {state:?}")]
+    #[error("bad state on timeline {timeline_id}: {state:?}")]
     BadState {
         timeline_id: TimelineId,
         state: TimelineState,
     },
 
-    #[error("Cancelled")]
+    #[error("cancelled")]
     Cancelled,
 }
 
@@ -3046,8 +3072,7 @@ impl Timeline {
             cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
             timeline_owned = timeline
                 .get_ready_ancestor_timeline(ancestor_timeline, ctx)
-                .await
-                .map_err(GetVectoredError::GetReadyAncestorError)?;
+                .await?;
             timeline = &*timeline_owned;
         };
 
@@ -3944,7 +3969,7 @@ impl Timeline {
                                     warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
                                     ZERO_PAGE.clone()
                                 } else {
-                                    return Err(CreateImageLayersError::PageReconstructError(err));
+                                    return Err(CreateImageLayersError::from(err));
                                 }
                             }
                         };
@@ -4004,7 +4029,7 @@ impl Timeline {
             let mut total_kb_retrieved = 0;
             let mut total_keys_retrieved = 0;
             for (k, v) in data {
-                let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
+                let v = v?;
                 total_kb_retrieved += KEY_SIZE + v.len();
                 total_keys_retrieved += 1;
                 new_data.insert(k, v);
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 9ac0086cde..e24459e7b9 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -2325,7 +2325,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
                 key_range,
             ))
         } else {
-            // The current compaction implementatin only ever requests the key space
+            // The current compaction implementation only ever requests the key space
             // at the compaction end LSN.
             anyhow::bail!("keyspace not available for requested lsn");
         }
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 804c7fca97..8425528740 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -515,7 +515,7 @@ impl WalIngest {
             && (decoded.xl_info == pg_constants::XLOG_FPI
                 || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
             // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
+            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)
             // do not materialize null pages because them most likely be soon replaced with real data
             && blk.bimg_len != 0
         {
@@ -1702,7 +1702,7 @@ async fn get_relsize(
     modification: &DatadirModification<'_>,
     rel: RelTag,
     ctx: &RequestContext,
-) -> anyhow::Result<BlockNumber> {
+) -> Result<BlockNumber, PageReconstructError> {
     let nblocks = if !modification
         .tline
         .get_rel_exists(rel, Version::Modified(modification), ctx)
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 62a3a91b0b..edddcefbe1 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -1018,7 +1018,7 @@ pub fn decode_wal_record(
                     );
 
                     let blk_img_is_compressed =
-                        postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?;
+                        postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version);
 
                     if blk_img_is_compressed {
                         debug!("compressed block image , pg_version = {}", pg_version);

From a7028d92b7560228e6cf63a3cf102c076bad3aa6 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 14 Aug 2024 13:35:29 +0100
Subject: [PATCH 38/62] proxy: start of jwk cache (#8690)

basic JWT implementation that caches JWKs and verifies signatures.

this code is currently not reachable from proxy, I just wanted to get
something merged in.
---
 Cargo.lock                    | 273 ++++++++++++++++-
 deny.toml                     |   5 +-
 proxy/Cargo.toml              |  11 +-
 proxy/src/auth/backend.rs     |   1 +
 proxy/src/auth/backend/jwt.rs | 556 ++++++++++++++++++++++++++++++++++
 proxy/src/http.rs             |  33 ++
 workspace_hack/Cargo.toml     |  12 +-
 7 files changed, 872 insertions(+), 19 deletions(-)
 create mode 100644 proxy/src/auth/backend/jwt.rs

diff --git a/Cargo.lock b/Cargo.lock
index 031fae0f37..dee15b6aa7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -484,7 +484,7 @@ dependencies = [
  "http 0.2.9",
  "http 1.1.0",
  "once_cell",
- "p256",
+ "p256 0.11.1",
  "percent-encoding",
  "ring 0.17.6",
  "sha2",
@@ -848,6 +848,12 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce"
 
+[[package]]
+name = "base16ct"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"
+
 [[package]]
 name = "base64"
 version = "0.13.1"
@@ -971,9 +977,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
 
 [[package]]
 name = "bytemuck"
-version = "1.16.0"
+version = "1.16.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
+checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83"
 
 [[package]]
 name = "byteorder"
@@ -1526,8 +1532,10 @@ version = "0.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
 dependencies = [
+ "generic-array",
  "rand_core 0.6.4",
  "subtle",
+ "zeroize",
 ]
 
 [[package]]
@@ -1621,6 +1629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c"
 dependencies = [
  "const-oid",
+ "pem-rfc7468",
  "zeroize",
 ]
 
@@ -1720,6 +1729,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
 dependencies = [
  "block-buffer",
+ "const-oid",
  "crypto-common",
  "subtle",
 ]
@@ -1771,11 +1781,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
 dependencies = [
  "der 0.6.1",
- "elliptic-curve",
- "rfc6979",
+ "elliptic-curve 0.12.3",
+ "rfc6979 0.3.1",
  "signature 1.6.4",
 ]
 
+[[package]]
+name = "ecdsa"
+version = "0.16.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca"
+dependencies = [
+ "der 0.7.8",
+ "digest",
+ "elliptic-curve 0.13.8",
+ "rfc6979 0.4.0",
+ "signature 2.2.0",
+ "spki 0.7.3",
+]
+
 [[package]]
 name = "either"
 version = "1.8.1"
@@ -1788,16 +1812,36 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
 dependencies = [
- "base16ct",
+ "base16ct 0.1.1",
  "crypto-bigint 0.4.9",
  "der 0.6.1",
  "digest",
- "ff",
+ "ff 0.12.1",
  "generic-array",
- "group",
- "pkcs8",
+ "group 0.12.1",
+ "pkcs8 0.9.0",
  "rand_core 0.6.4",
- "sec1",
+ "sec1 0.3.0",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "elliptic-curve"
+version = "0.13.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
+dependencies = [
+ "base16ct 0.2.0",
+ "crypto-bigint 0.5.5",
+ "digest",
+ "ff 0.13.0",
+ "generic-array",
+ "group 0.13.0",
+ "pem-rfc7468",
+ "pkcs8 0.10.2",
+ "rand_core 0.6.4",
+ "sec1 0.7.3",
  "subtle",
  "zeroize",
 ]
@@ -1951,6 +1995,16 @@ dependencies = [
  "subtle",
 ]
 
+[[package]]
+name = "ff"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449"
+dependencies = [
+ "rand_core 0.6.4",
+ "subtle",
+]
+
 [[package]]
 name = "filetime"
 version = "0.2.22"
@@ -2148,6 +2202,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
 dependencies = [
  "typenum",
  "version_check",
+ "zeroize",
 ]
 
 [[package]]
@@ -2214,7 +2269,18 @@ version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
 dependencies = [
- "ff",
+ "ff 0.12.1",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "group"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
+dependencies = [
+ "ff 0.13.0",
  "rand_core 0.6.4",
  "subtle",
 ]
@@ -2776,6 +2842,42 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "jose-b64"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56"
+dependencies = [
+ "base64ct",
+ "serde",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "jose-jwa"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "jose-jwk"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7"
+dependencies = [
+ "jose-b64",
+ "jose-jwa",
+ "p256 0.13.2",
+ "p384",
+ "rsa",
+ "serde",
+ "zeroize",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.69"
@@ -2835,6 +2937,9 @@ name = "lazy_static"
 version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+dependencies = [
+ "spin 0.5.2",
+]
 
 [[package]]
 name = "lazycell"
@@ -3204,6 +3309,23 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "num-bigint-dig"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151"
+dependencies = [
+ "byteorder",
+ "lazy_static",
+ "libm",
+ "num-integer",
+ "num-iter",
+ "num-traits",
+ "rand 0.8.5",
+ "smallvec",
+ "zeroize",
+]
+
 [[package]]
 name = "num-complex"
 version = "0.4.4"
@@ -3481,11 +3603,33 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594"
 dependencies = [
- "ecdsa",
- "elliptic-curve",
+ "ecdsa 0.14.8",
+ "elliptic-curve 0.12.3",
  "sha2",
 ]
 
+[[package]]
+name = "p256"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b"
+dependencies = [
+ "ecdsa 0.16.9",
+ "elliptic-curve 0.13.8",
+ "primeorder",
+ "sha2",
+]
+
+[[package]]
+name = "p384"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209"
+dependencies = [
+ "elliptic-curve 0.13.8",
+ "primeorder",
+]
+
 [[package]]
 name = "pagebench"
 version = "0.1.0"
@@ -3847,6 +3991,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "pem-rfc7468"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
+dependencies = [
+ "base64ct",
+]
+
 [[package]]
 name = "percent-encoding"
 version = "2.2.0"
@@ -3913,6 +4066,17 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
+[[package]]
+name = "pkcs1"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f"
+dependencies = [
+ "der 0.7.8",
+ "pkcs8 0.10.2",
+ "spki 0.7.3",
+]
+
 [[package]]
 name = "pkcs8"
 version = "0.9.0"
@@ -3923,6 +4087,16 @@ dependencies = [
  "spki 0.6.0",
 ]
 
+[[package]]
+name = "pkcs8"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
+dependencies = [
+ "der 0.7.8",
+ "spki 0.7.3",
+]
+
 [[package]]
 name = "pkg-config"
 version = "0.3.27"
@@ -4116,6 +4290,15 @@ dependencies = [
  "syn 2.0.52",
 ]
 
+[[package]]
+name = "primeorder"
+version = "0.13.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6"
+dependencies = [
+ "elliptic-curve 0.13.8",
+]
+
 [[package]]
 name = "proc-macro-hack"
 version = "0.5.20+deprecated"
@@ -4233,6 +4416,7 @@ version = "0.1.0"
 dependencies = [
  "ahash",
  "anyhow",
+ "arc-swap",
  "async-compression",
  "async-trait",
  "atomic-take",
@@ -4250,6 +4434,7 @@ dependencies = [
  "consumption_metrics",
  "crossbeam-deque",
  "dashmap",
+ "ecdsa 0.16.9",
  "env_logger",
  "fallible-iterator",
  "framed-websockets",
@@ -4270,12 +4455,15 @@ dependencies = [
  "indexmap 2.0.1",
  "ipnet",
  "itertools 0.10.5",
+ "jose-jwa",
+ "jose-jwk",
  "lasso",
  "md5",
  "measured",
  "metrics",
  "once_cell",
  "opentelemetry",
+ "p256 0.13.2",
  "parking_lot 0.12.1",
  "parquet",
  "parquet_derive",
@@ -4296,6 +4484,7 @@ dependencies = [
  "reqwest-retry",
  "reqwest-tracing",
  "routerify",
+ "rsa",
  "rstest",
  "rustc-hash",
  "rustls 0.22.4",
@@ -4305,6 +4494,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
+ "signature 2.2.0",
  "smallvec",
  "smol_str",
  "socket2 0.5.5",
@@ -4807,6 +4997,16 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "rfc6979"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2"
+dependencies = [
+ "hmac",
+ "subtle",
+]
+
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -4867,6 +5067,26 @@ dependencies = [
  "archery",
 ]
 
+[[package]]
+name = "rsa"
+version = "0.9.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc"
+dependencies = [
+ "const-oid",
+ "digest",
+ "num-bigint-dig",
+ "num-integer",
+ "num-traits",
+ "pkcs1",
+ "pkcs8 0.10.2",
+ "rand_core 0.6.4",
+ "signature 2.2.0",
+ "spki 0.7.3",
+ "subtle",
+ "zeroize",
+]
+
 [[package]]
 name = "rstest"
 version = "0.18.2"
@@ -5195,10 +5415,24 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
 dependencies = [
- "base16ct",
+ "base16ct 0.1.1",
  "der 0.6.1",
  "generic-array",
- "pkcs8",
+ "pkcs8 0.9.0",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "sec1"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
+dependencies = [
+ "base16ct 0.2.0",
+ "der 0.7.8",
+ "generic-array",
+ "pkcs8 0.10.2",
  "subtle",
  "zeroize",
 ]
@@ -5545,6 +5779,7 @@ version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
 dependencies = [
+ "digest",
  "rand_core 0.6.4",
 ]
 
@@ -7379,13 +7614,17 @@ dependencies = [
  "clap",
  "clap_builder",
  "crossbeam-utils",
+ "crypto-bigint 0.5.5",
+ "der 0.7.8",
  "deranged",
+ "digest",
  "either",
  "fail",
  "futures-channel",
  "futures-executor",
  "futures-io",
  "futures-util",
+ "generic-array",
  "getrandom 0.2.11",
  "hashbrown 0.14.5",
  "hex",
@@ -7393,6 +7632,7 @@ dependencies = [
  "hyper 0.14.26",
  "indexmap 1.9.3",
  "itertools 0.10.5",
+ "lazy_static",
  "libc",
  "log",
  "memchr",
@@ -7416,7 +7656,9 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
+ "signature 2.2.0",
  "smallvec",
+ "spki 0.7.3",
  "subtle",
  "syn 1.0.109",
  "syn 2.0.52",
@@ -7527,6 +7769,7 @@ version = "1.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
 dependencies = [
+ "serde",
  "zeroize_derive",
 ]
 
diff --git a/deny.toml b/deny.toml
index dc985138e6..327ac58db7 100644
--- a/deny.toml
+++ b/deny.toml
@@ -22,7 +22,10 @@ feature-depth = 1
 [advisories]
 db-urls = ["https://github.com/rustsec/advisory-db"]
 yanked = "warn"
-ignore = []
+
+[[advisories.ignore]]
+id = "RUSTSEC-2023-0071"
+reason = "the marvin attack only affects private key decryption, not public key signature verification"
 
 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index b316c53034..21d92abb20 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -11,6 +11,7 @@ testing = []
 [dependencies]
 ahash.workspace = true
 anyhow.workspace = true
+arc-swap.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
 atomic-take.workspace = true
@@ -73,7 +74,7 @@ rustls.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-sha2 = { workspace = true, features = ["asm"] }
+sha2 = { workspace = true, features = ["asm", "oid"] }
 smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
@@ -103,6 +104,14 @@ x509-parser.workspace = true
 postgres-protocol.workspace = true
 redis.workspace = true
 
+# jwt stuff
+jose-jwa = "0.1.2"
+jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
+signature = "2"
+ecdsa = "0.16"
+p256 = "0.13"
+rsa = "0.9"
+
 workspace_hack.workspace = true
 
 [dev-dependencies]
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 90dea01bf3..c6a0b2af5a 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -1,5 +1,6 @@
 mod classic;
 mod hacks;
+pub mod jwt;
 mod link;
 
 use std::net::IpAddr;
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
new file mode 100644
index 0000000000..0c2ca8fb97
--- /dev/null
+++ b/proxy/src/auth/backend/jwt.rs
@@ -0,0 +1,556 @@
+use std::{future::Future, sync::Arc, time::Duration};
+
+use anyhow::{bail, ensure, Context};
+use arc_swap::ArcSwapOption;
+use dashmap::DashMap;
+use jose_jwk::crypto::KeyInfo;
+use signature::Verifier;
+use tokio::time::Instant;
+
+use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
+
+// TODO(conrad): make these configurable.
+const MIN_RENEW: Duration = Duration::from_secs(30);
+const AUTO_RENEW: Duration = Duration::from_secs(300);
+const MAX_RENEW: Duration = Duration::from_secs(3600);
+const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
+
+/// How to get the JWT auth rules
+pub trait FetchAuthRules: Clone + Send + Sync + 'static {
+    fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
+}
+
+#[derive(Clone)]
+struct FetchAuthRulesFromCplane {
+    #[allow(dead_code)]
+    endpoint: EndpointIdInt,
+}
+
+impl FetchAuthRules for FetchAuthRulesFromCplane {
+    async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
+        Err(anyhow::anyhow!("not yet implemented"))
+    }
+}
+
+pub struct AuthRules {
+    jwks_urls: Vec<url::Url>,
+}
+
+#[derive(Default)]
+pub struct JwkCache {
+    client: reqwest::Client,
+
+    map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
+}
+
+pub struct JwkCacheEntryLock {
+    cached: ArcSwapOption<JwkCacheEntry>,
+    lookup: tokio::sync::Semaphore,
+}
+
+impl Default for JwkCacheEntryLock {
+    fn default() -> Self {
+        JwkCacheEntryLock {
+            cached: ArcSwapOption::empty(),
+            lookup: tokio::sync::Semaphore::new(1),
+        }
+    }
+}
+
+pub struct JwkCacheEntry {
+    /// Should refetch at least every hour to verify when old keys have been removed.
+    /// Should refetch when new key IDs are seen only every 5 minutes or so
+    last_retrieved: Instant,
+
+    /// cplane will return multiple JWKs urls that we need to scrape.
+    key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
+}
+
+impl JwkCacheEntryLock {
+    async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
+        JwkRenewalPermit::acquire_permit(self).await
+    }
+
+    fn try_acquire_permit<'a>(self: &'a Arc<Self>) -> Option<JwkRenewalPermit<'a>> {
+        JwkRenewalPermit::try_acquire_permit(self)
+    }
+
+    async fn renew_jwks<F: FetchAuthRules>(
+        &self,
+        _permit: JwkRenewalPermit<'_>,
+        client: &reqwest::Client,
+        auth_rules: &F,
+    ) -> anyhow::Result<Arc<JwkCacheEntry>> {
+        // double check that no one beat us to updating the cache.
+        let now = Instant::now();
+        let guard = self.cached.load_full();
+        if let Some(cached) = guard {
+            let last_update = now.duration_since(cached.last_retrieved);
+            if last_update < Duration::from_secs(300) {
+                return Ok(cached);
+            }
+        }
+
+        let rules = auth_rules.fetch_auth_rules().await?;
+        let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
+            rules.jwks_urls.len(),
+            ahash::RandomState::new(),
+        );
+        // TODO(conrad): run concurrently
+        // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
+        for url in rules.jwks_urls {
+            let req = client.get(url.clone());
+            // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
+            match req.send().await.and_then(|r| r.error_for_status()) {
+                // todo: should we re-insert JWKs if we want to keep this JWKs URL?
+                // I expect these failures would be quite sparse.
+                Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
+                Ok(r) => {
+                    let resp: http::Response<reqwest::Body> = r.into();
+                    match parse_json_body_with_limit::<jose_jwk::JwkSet>(
+                        resp.into_body(),
+                        MAX_JWK_BODY_SIZE,
+                    )
+                    .await
+                    {
+                        Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
+                        Ok(jwks) => {
+                            key_sets.insert(url, jwks);
+                        }
+                    }
+                }
+            }
+        }
+
+        let entry = Arc::new(JwkCacheEntry {
+            last_retrieved: now,
+            key_sets,
+        });
+        self.cached.swap(Some(Arc::clone(&entry)));
+
+        Ok(entry)
+    }
+
+    async fn get_or_update_jwk_cache<F: FetchAuthRules>(
+        self: &Arc<Self>,
+        client: &reqwest::Client,
+        fetch: &F,
+    ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
+        let now = Instant::now();
+        let guard = self.cached.load_full();
+
+        // if we have no cached JWKs, try and get some
+        let Some(cached) = guard else {
+            let permit = self.acquire_permit().await;
+            return self.renew_jwks(permit, client, fetch).await;
+        };
+
+        let last_update = now.duration_since(cached.last_retrieved);
+
+        // check if the cached JWKs need updating.
+        if last_update > MAX_RENEW {
+            let permit = self.acquire_permit().await;
+
+            // it's been too long since we checked the keys. wait for them to update.
+            return self.renew_jwks(permit, client, fetch).await;
+        }
+
+        // every 5 minutes we should spawn a job to eagerly update the token.
+        if last_update > AUTO_RENEW {
+            if let Some(permit) = self.try_acquire_permit() {
+                tracing::debug!("JWKs should be renewed. Renewal permit acquired");
+                let permit = permit.into_owned();
+                let entry = self.clone();
+                let client = client.clone();
+                let fetch = fetch.clone();
+                tokio::spawn(async move {
+                    if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
+                        tracing::warn!(error=?e, "could not fetch JWKs in background job");
+                    }
+                });
+            } else {
+                tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping");
+            }
+        }
+
+        Ok(cached)
+    }
+
+    async fn check_jwt<F: FetchAuthRules>(
+        self: &Arc<Self>,
+        jwt: String,
+        client: &reqwest::Client,
+        fetch: &F,
+    ) -> Result<(), anyhow::Error> {
+        // JWT compact form is defined to be
+        // <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
+        // where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
+
+        let (header_payload, signature) = jwt
+            .rsplit_once(".")
+            .context("Provided authentication token is not a valid JWT encoding")?;
+        let (header, _payload) = header_payload
+            .split_once(".")
+            .context("Provided authentication token is not a valid JWT encoding")?;
+
+        let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+        let header = serde_json::from_slice::<JWTHeader>(&header)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+
+        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+
+        ensure!(header.typ == "JWT");
+        let kid = header.kid.context("missing key id")?;
+
+        let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
+
+        // get the key from the JWKs if possible. If not, wait for the keys to update.
+        let jwk = loop {
+            let jwk = guard
+                .key_sets
+                .values()
+                .flat_map(|jwks| &jwks.keys)
+                .find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
+
+            match jwk {
+                Some(jwk) => break jwk,
+                None if guard.last_retrieved.elapsed() > MIN_RENEW => {
+                    let permit = self.acquire_permit().await;
+                    guard = self.renew_jwks(permit, client, fetch).await?;
+                }
+                _ => {
+                    bail!("jwk not found");
+                }
+            }
+        };
+
+        ensure!(
+            jwk.is_supported(&header.alg),
+            "signature algorithm not supported"
+        );
+
+        match &jwk.key {
+            jose_jwk::Key::Ec(key) => {
+                verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
+            }
+            jose_jwk::Key::Rsa(key) => {
+                verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?;
+            }
+            key => bail!("unsupported key type {key:?}"),
+        };
+
+        // TODO(conrad): verify iss, exp, nbf, etc...
+
+        Ok(())
+    }
+}
+
+impl JwkCache {
+    pub async fn check_jwt(
+        &self,
+        endpoint: EndpointIdInt,
+        jwt: String,
+    ) -> Result<(), anyhow::Error> {
+        // try with just a read lock first
+        let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
+        let entry = match entry {
+            Some(entry) => entry,
+            None => {
+                // acquire a write lock after to insert.
+                let entry = self.map.entry(endpoint).or_default();
+                Arc::clone(&*entry)
+            }
+        };
+
+        let fetch = FetchAuthRulesFromCplane { endpoint };
+        entry.check_jwt(jwt, &self.client, &fetch).await
+    }
+}
+
+fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> {
+    use ecdsa::Signature;
+    use signature::Verifier;
+
+    match key.crv {
+        jose_jwk::EcCurves::P256 => {
+            let pk =
+                p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?;
+            let key = p256::ecdsa::VerifyingKey::from(&pk);
+            let sig = Signature::from_slice(sig)?;
+            key.verify(data, &sig)?;
+        }
+        key => bail!("unsupported ec key type {key:?}"),
+    }
+
+    Ok(())
+}
+
+fn verify_rsa_signature(
+    data: &[u8],
+    sig: &[u8],
+    key: &jose_jwk::Rsa,
+    alg: &Option<jose_jwa::Algorithm>,
+) -> anyhow::Result<()> {
+    use jose_jwa::{Algorithm, Signing};
+    use rsa::{
+        pkcs1v15::{Signature, VerifyingKey},
+        RsaPublicKey,
+    };
+
+    let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;
+
+    match alg {
+        Some(Algorithm::Signing(Signing::Rs256)) => {
+            let key = VerifyingKey::<sha2::Sha256>::new(key);
+            let sig = Signature::try_from(sig)?;
+            key.verify(data, &sig)?;
+        }
+        _ => bail!("invalid RSA signing algorithm"),
+    };
+
+    Ok(())
+}
+
+/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
+#[derive(serde::Deserialize, serde::Serialize)]
+struct JWTHeader<'a> {
+    /// must be "JWT"
+    typ: &'a str,
+    /// must be a supported alg
+    alg: jose_jwa::Algorithm,
+    /// key id, must be provided for our usecase
+    kid: Option<&'a str>,
+}
+
+struct JwkRenewalPermit<'a> {
+    inner: Option<JwkRenewalPermitInner<'a>>,
+}
+
+enum JwkRenewalPermitInner<'a> {
+    Owned(Arc<JwkCacheEntryLock>),
+    Borrowed(&'a Arc<JwkCacheEntryLock>),
+}
+
+impl JwkRenewalPermit<'_> {
+    fn into_owned(mut self) -> JwkRenewalPermit<'static> {
+        JwkRenewalPermit {
+            inner: self.inner.take().map(JwkRenewalPermitInner::into_owned),
+        }
+    }
+
+    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
+        match from.lookup.acquire().await {
+            Ok(permit) => {
+                permit.forget();
+                JwkRenewalPermit {
+                    inner: Some(JwkRenewalPermitInner::Borrowed(from)),
+                }
+            }
+            Err(_) => panic!("semaphore should not be closed"),
+        }
+    }
+
+    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
+        match from.lookup.try_acquire() {
+            Ok(permit) => {
+                permit.forget();
+                Some(JwkRenewalPermit {
+                    inner: Some(JwkRenewalPermitInner::Borrowed(from)),
+                })
+            }
+            Err(tokio::sync::TryAcquireError::NoPermits) => None,
+            Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"),
+        }
+    }
+}
+
+impl JwkRenewalPermitInner<'_> {
+    fn into_owned(self) -> JwkRenewalPermitInner<'static> {
+        match self {
+            JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p),
+            JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)),
+        }
+    }
+}
+
+impl Drop for JwkRenewalPermit<'_> {
+    fn drop(&mut self) {
+        let entry = match &self.inner {
+            None => return,
+            Some(JwkRenewalPermitInner::Owned(p)) => p,
+            Some(JwkRenewalPermitInner::Borrowed(p)) => *p,
+        };
+        entry.lookup.add_permits(1);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
+
+    use base64::URL_SAFE_NO_PAD;
+    use bytes::Bytes;
+    use http::Response;
+    use http_body_util::Full;
+    use hyper1::service::service_fn;
+    use hyper_util::rt::TokioIo;
+    use rand::rngs::OsRng;
+    use signature::Signer;
+    use tokio::net::TcpListener;
+
+    fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) {
+        let sk = p256::SecretKey::random(&mut OsRng);
+        let pk = sk.public_key().into();
+        let jwk = jose_jwk::Jwk {
+            key: jose_jwk::Key::Ec(pk),
+            prm: jose_jwk::Parameters {
+                kid: Some(kid),
+                alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
+                ..Default::default()
+            },
+        };
+        (sk, jwk)
+    }
+
+    fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) {
+        let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap();
+        let pk = sk.to_public_key().into();
+        let jwk = jose_jwk::Jwk {
+            key: jose_jwk::Key::Rsa(pk),
+            prm: jose_jwk::Parameters {
+                kid: Some(kid),
+                alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
+                ..Default::default()
+            },
+        };
+        (sk, jwk)
+    }
+
+    fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
+        let header = JWTHeader {
+            typ: "JWT",
+            alg: jose_jwa::Algorithm::Signing(sig),
+            kid: Some(&kid),
+        };
+        let body = typed_json::json! {{
+            "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
+        }};
+
+        let header =
+            base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
+        let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
+
+        format!("{header}.{body}")
+    }
+
+    fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
+        use p256::ecdsa::{Signature, SigningKey};
+
+        let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
+        let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
+        let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
+
+        format!("{payload}.{sig}")
+    }
+
+    fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
+        use rsa::pkcs1v15::SigningKey;
+        use rsa::signature::SignatureEncoding;
+
+        let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256);
+        let sig = SigningKey::<sha2::Sha256>::new(key).sign(payload.as_bytes());
+        let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
+
+        format!("{payload}.{sig}")
+    }
+
+    #[tokio::test]
+    async fn renew() {
+        let (rs1, jwk1) = new_rsa_jwk("1".into());
+        let (rs2, jwk2) = new_rsa_jwk("2".into());
+        let (ec1, jwk3) = new_ec_jwk("3".into());
+        let (ec2, jwk4) = new_ec_jwk("4".into());
+
+        let jwt1 = new_rsa_jwt("1".into(), rs1);
+        let jwt2 = new_rsa_jwt("2".into(), rs2);
+        let jwt3 = new_ec_jwt("3".into(), ec1);
+        let jwt4 = new_ec_jwt("4".into(), ec2);
+
+        let foo_jwks = jose_jwk::JwkSet {
+            keys: vec![jwk1, jwk3],
+        };
+        let bar_jwks = jose_jwk::JwkSet {
+            keys: vec![jwk2, jwk4],
+        };
+
+        let service = service_fn(move |req| {
+            let foo_jwks = foo_jwks.clone();
+            let bar_jwks = bar_jwks.clone();
+            async move {
+                let jwks = match req.uri().path() {
+                    "/foo" => &foo_jwks,
+                    "/bar" => &bar_jwks,
+                    _ => {
+                        return Response::builder()
+                            .status(404)
+                            .body(Full::new(Bytes::new()));
+                    }
+                };
+                let body = serde_json::to_vec(jwks).unwrap();
+                Response::builder()
+                    .status(200)
+                    .body(Full::new(Bytes::from(body)))
+            }
+        });
+
+        let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
+        let server = hyper1::server::conn::http1::Builder::new();
+        let addr = listener.local_addr().unwrap();
+        tokio::spawn(async move {
+            loop {
+                let (s, _) = listener.accept().await.unwrap();
+                let serve = server.serve_connection(TokioIo::new(s), service.clone());
+                tokio::spawn(serve.into_future());
+            }
+        });
+
+        let client = reqwest::Client::new();
+
+        #[derive(Clone)]
+        struct Fetch(SocketAddr);
+
+        impl FetchAuthRules for Fetch {
+            async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
+                Ok(AuthRules {
+                    jwks_urls: vec![
+                        format!("http://{}/foo", self.0).parse().unwrap(),
+                        format!("http://{}/bar", self.0).parse().unwrap(),
+                    ],
+                })
+            }
+        }
+
+        let jwk_cache = Arc::new(JwkCacheEntryLock::default());
+
+        jwk_cache
+            .check_jwt(jwt1, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt2, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt3, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt4, &client, &Fetch(addr))
+            .await
+            .unwrap();
+    }
+}
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index dd7164181d..1f1dd8c415 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -6,6 +6,12 @@ pub mod health_server;
 
 use std::time::Duration;
 
+use anyhow::bail;
+use bytes::Bytes;
+use http_body_util::BodyExt;
+use hyper1::body::Body;
+use serde::de::DeserializeOwned;
+
 pub use reqwest::{Request, Response, StatusCode};
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
@@ -96,6 +102,33 @@ impl Endpoint {
     }
 }
 
+pub async fn parse_json_body_with_limit<D: DeserializeOwned>(
+    mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
+    limit: usize,
+) -> anyhow::Result<D> {
+    // We could use `b.limited().collect().await.to_bytes()` here
+    // but this ends up being slightly more efficient as far as I can tell.
+
+    // check the lower bound of the size hint.
+    // in reqwest, this value is influenced by the Content-Length header.
+    let lower_bound = match usize::try_from(b.size_hint().lower()) {
+        Ok(bound) if bound <= limit => bound,
+        _ => bail!("Content length exceeds limit of {limit} bytes"),
+    };
+    let mut bytes = Vec::with_capacity(lower_bound);
+
+    while let Some(frame) = b.frame().await.transpose()? {
+        if let Ok(data) = frame.into_data() {
+            if bytes.len() + data.len() > limit {
+                bail!("Content length exceeds limit of {limit} bytes")
+            }
+            bytes.extend_from_slice(&data);
+        }
+    }
+
+    Ok(serde_json::from_slice::<D>(&bytes)?)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 832fe06bf6..2d9b372654 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -30,13 +30,17 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
+crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] }
+der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] }
 deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] }
+digest = { version = "0.10", features = ["mac", "oid", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
+generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 hex = { version = "0.4", features = ["serde"] }
@@ -44,6 +48,7 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
+lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
@@ -64,8 +69,10 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
-sha2 = { version = "0.10", features = ["asm"] }
+sha2 = { version = "0.10", features = ["asm", "oid"] }
+signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] }
 smallvec = { version = "1", default-features = false, features = ["const_new", "write"] }
+spki = { version = "0.7", default-features = false, features = ["pem", "std"] }
 subtle = { version = "2" }
 sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] }
 tikv-jemalloc-sys = { version = "0.5" }
@@ -81,7 +88,7 @@ tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
 url = { version = "2", features = ["serde"] }
 uuid = { version = "1", features = ["serde", "v4", "v7"] }
-zeroize = { version = "1", features = ["derive"] }
+zeroize = { version = "1", features = ["derive", "serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }
@@ -97,6 +104,7 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
+lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }

From abb53ba36d0cc5da7ead626c3de91d41a255fc68 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 14 Aug 2024 13:37:03 +0100
Subject: [PATCH 39/62] storcon_cli: don't clobber heatmap interval when
 setting eviction (#8722)

## Problem

This command is kind of a hack, used when we're migrating large tenants
and want to get their resident size down. It sets the tenant config to a
fixed value, which omitted heatmap_period, so caused secondaries to get
out of date.

## Summary of changes

- Set heatmap period to the same 300s default that we use elsewhere when
updating eviction settings

This is not as elegant as some general purpose partial modification of
the config, but it practically makes the command safer to use.
---
 control_plane/storcon_cli/src/main.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 5c1add070a..e27491c1c8 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -622,6 +622,7 @@ async fn main() -> anyhow::Result<()> {
                                 threshold: threshold.into(),
                             },
                         )),
+                        heatmap_period: Some("300s".to_string()),
                         ..Default::default()
                     },
                 })

From 36c1719a07a8480f9681dccc6ec6f4b192e3ebbe Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 14 Aug 2024 13:38:25 +0100
Subject: [PATCH 40/62] CI(build-neon): fix accidental neon rebuild on `cargo
 test` (#8721)

## Problem

During `Run rust tests` step (for debug builds), we accidentally rebuild
neon twice (by `cargo test --doc` and by `cargo nextest run`).
It happens because we don't set `cov_prefix` for the `cargo test --doc`
command, which triggers rebuilding with different build flags, and one
more rebuild by `cargo nextest run`.

## Summary of changes
- Set `cov_prefix` for `cargo test --doc` to prevent unneeded rebuilds
---
 .github/workflows/_build-and-test-locally.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index a0ed169024..5e5908860e 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -208,7 +208,7 @@ jobs:
           export LD_LIBRARY_PATH
 
           #nextest does not yet support running doctests
-          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+          ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
 
           for io_engine in std-fs tokio-epoll-uring ; do
             NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES

From 60fc1e8cc8a906e8b37ee795fe5fca666703fbec Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 14 Aug 2024 16:48:15 +0300
Subject: [PATCH 41/62] chore: even more responsive compaction cancellation
 (#8725)

Some benchmarks and tests might still fail because of #8655 (tracked in
#8708) because we are not fast enough to shut down ([one evidence]).
Partially this is explained by the current validation mode of streaming
k-merge, but otherwise because that is where we use a lot of time in
compaction. Outside of L0 => L1 compaction, the image layer generation
is already guarded by vectored reads doing cancellation checks.

32768 is a wild guess based on looking how many keys we put in each
layer in a bench (1-2 million), but I assume it will be good enough
divisor. Doing checks more often will start showing up as contention
which we cannot currently measure. Doing checks less often might be
reasonable.

[one evidence]:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10384136483/index.html#suites/9681106e61a1222669b9d22ab136d07b/96e6d53af234924/

Earlier PR: #8706.
---
 pageserver/src/tenant/timeline.rs            |  7 ++++++-
 pageserver/src/tenant/timeline/compaction.rs | 13 +++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d437724673..b4d908b130 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4540,7 +4540,12 @@ impl Timeline {
         new_images: &[ResidentLayer],
         layers_to_remove: &[Layer],
     ) -> Result<(), CompactionError> {
-        let mut guard = self.layers.write().await;
+        let mut guard = tokio::select! {
+            guard = self.layers.write() => guard,
+            _ = self.cancel.cancelled() => {
+                return Err(CompactionError::ShuttingDown);
+            }
+        };
 
         let mut duplicated_layers = HashSet::new();
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index e24459e7b9..7370ec1386 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1048,11 +1048,22 @@ impl Timeline {
         let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
         let mut next_hole = 0; // index of next hole in holes vector
 
+        let mut keys = 0;
+
         while let Some((key, lsn, value)) = all_values_iter
             .next(ctx)
             .await
             .map_err(CompactionError::Other)?
         {
+            keys += 1;
+
+            if keys % 32_768 == 0 && self.cancel.is_cancelled() {
+                // avoid hitting the cancellation token on every key. in benches, we end up
+                // shuffling an order of million keys per layer, this means we'll check it
+                // around tens of times per layer.
+                return Err(CompactionError::ShuttingDown);
+            }
+
             let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
             // We need to check key boundaries once we reach next key or end of layer with the same key
             if !same_key || lsn == dup_end_lsn {
@@ -1157,6 +1168,8 @@ impl Timeline {
                         .await
                         .map_err(CompactionError::Other)?,
                     );
+
+                    keys = 0;
                 }
 
                 writer

From 70b18ff4817658160a34305c7a3f3fa1a21d164e Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 14 Aug 2024 17:03:21 +0100
Subject: [PATCH 42/62] CI(neon-image): add ARM-specific RUSTFLAGS (#8566)

## Problem

It's recommended that a couple of additional RUSTFLAGS be set up to
improve the performance of Rust applications on AWS Graviton.

See
https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md

Note: Apple Silicon is compatible with neoverse-n1:
```
$ clang --version
Apple clang version 15.0.0 (clang-1500.3.9.4)
Target: arm64-apple-darwin23.6.0
Thread model: posix
InstalledDir: /Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin
$
$ clang --print-supported-cpus 2>&1 | grep neoverse-
	neoverse-512tvb
	neoverse-e1
	neoverse-n1
	neoverse-n2
	neoverse-v1
	neoverse-v2
```

## Summary of changes
- Add `-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1` to RUSTFLAGS for
ARM images
---
 .github/workflows/build_and_test.yml | 3 +++
 Dockerfile                           | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 78f9f11a65..a591cb73f2 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -503,7 +503,10 @@ jobs:
       - uses: docker/build-push-action@v6
         with:
           context: .
+          # ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure)
+          # https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md
           build-args: |
+            ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }}
             GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
             BUILD_TAG=${{ needs.tag.outputs.build-tag }}
             TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
diff --git a/Dockerfile b/Dockerfile
index ceb1c7cb55..d3d12330c6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -35,8 +35,9 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
 COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
 COPY --chown=nonroot . .
 
+ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
+    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
       --bin pg_sni_router  \
       --bin pageserver  \
       --bin pagectl  \

From aa2e16f307c4a55f5ae1ece22a344d3cddc1dccc Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 14 Aug 2024 17:56:59 +0100
Subject: [PATCH 43/62] CI: misc cleanup & fixes (#8559)

## Problem
A bunch of small fixes and improvements for CI, that are too small to
have a separate PR for them

## Summary of changes
- CI(build-and-test): fix parenthesis
- CI(actionlint): fix path to workflow file
- CI: remove default args from actions/checkout
- CI: remove `gen3` label, using a couple `self-hosted` +
`small{,-arm64}`/`large{,-arm64}` is enough
- CI: prettify Slack messages, hide links behind text messages
- C(build-and-test): add more dependencies to `conclusion` job
---
 .github/actionlint.yml                        |  1 -
 .../actions/run-python-test-set/action.yml    |  1 -
 .github/workflows/_build-and-test-locally.yml |  2 -
 .github/workflows/actionlint.yml              |  2 +-
 .github/workflows/benchmarking.yml            | 48 ++++++++++++++-----
 .github/workflows/build-build-tools-image.yml |  2 +-
 .github/workflows/build_and_test.yml          | 40 +++++++---------
 .github/workflows/neon_extra_builds.yml       |  2 -
 .github/workflows/periodic_pagebench.yml      |  2 +-
 9 files changed, 58 insertions(+), 42 deletions(-)

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index a5282876d0..4ad8a7b460 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,7 +1,6 @@
 self-hosted-runner:
   labels:
     - arm64
-    - gen3
     - large
     - large-arm64
     - small
diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 9d39ab6ad7..4ccf190c6a 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -83,7 +83,6 @@ runs:
       uses: actions/checkout@v4
       with:
         submodules: true
-        fetch-depth: 1
 
     - name: Cache poetry deps
       uses: actions/cache@v4
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 5e5908860e..af76e51ebc 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -70,7 +70,6 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 1
 
       - name: Set pg 14 revision for caching
         id: pg_v14_rev
@@ -263,7 +262,6 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 1
 
       - name: Pytest regression tests
         uses: ./.github/actions/run-python-test-set
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index 34fd8b1d15..85cfe7446e 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -44,7 +44,7 @@ jobs:
             grep -ERl $PAT .github/workflows |\
             while read -r f
             do
-              l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
+              l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1)
               echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
             done
             exit 1
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 106c3e3138..f99a037489 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -96,7 +96,7 @@ jobs:
       uses: aws-actions/configure-aws-credentials@v4
       with:
         aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}  
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
         role-duration-seconds: 18000 # 5 hours
 
     - name: Download Neon artifact
@@ -146,6 +146,7 @@ jobs:
         api_key: ${{ secrets.NEON_STAGING_API_KEY }}
 
     - name: Create Allure report
+      id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
 
@@ -154,7 +155,10 @@ jobs:
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: |
+          Periodic perf testing: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
@@ -176,7 +180,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
-    
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:
@@ -215,6 +219,7 @@ jobs:
         NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
 
     - name: Create Allure report
+      id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
 
@@ -365,7 +370,7 @@ jobs:
         aws-region: eu-central-1
         role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
         role-duration-seconds: 18000 # 5 hours
-        
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:
@@ -460,6 +465,7 @@ jobs:
         api_key: ${{ secrets.NEON_STAGING_API_KEY }}
 
     - name: Create Allure report
+      id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
 
@@ -468,7 +474,10 @@ jobs:
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: |
+          Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
@@ -542,7 +551,7 @@ jobs:
         esac
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        
+
     - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
       uses: aws-actions/configure-aws-credentials@v4
       with:
@@ -577,8 +586,9 @@ jobs:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-    
+
     - name: Create Allure report
+      id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
 
@@ -587,7 +597,10 @@ jobs:
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: |
+          Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
@@ -670,6 +683,7 @@ jobs:
         TEST_OLAP_SCALE: 10
 
     - name: Create Allure report
+      id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
 
@@ -678,7 +692,10 @@ jobs:
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: |
+          Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
@@ -764,6 +781,7 @@ jobs:
         TEST_OLAP_SCALE: ${{ matrix.scale }}
 
     - name: Create Allure report
+      id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
 
@@ -772,7 +790,10 @@ jobs:
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: |
+          Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
@@ -843,6 +864,7 @@ jobs:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 
     - name: Create Allure report
+      id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
 
@@ -851,6 +873,10 @@ jobs:
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: |
+          Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
index f4f6e6971f..ca5ff573e1 100644
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -38,7 +38,7 @@ jobs:
       matrix:
         arch: [ x64, arm64 ]
 
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     env:
       IMAGE_TAG: ${{ inputs.image-tag }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index a591cb73f2..ee6d3ba005 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -48,7 +48,7 @@ jobs:
 
   tag:
     needs: [ check-permissions ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
     outputs:
       build-tag: ${{steps.build-tag.outputs.tag}}
@@ -90,7 +90,7 @@ jobs:
 
   check-codestyle-python:
     needs: [ check-permissions, build-build-tools-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
       credentials:
@@ -101,9 +101,6 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-        with:
-          submodules: false
-          fetch-depth: 1
 
       - name: Cache poetry deps
         uses: actions/cache@v4
@@ -142,7 +139,6 @@ jobs:
         uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 1
 
 #      Disabled for now
 #      - name: Restore cargo deps cache
@@ -204,7 +200,7 @@ jobs:
       matrix:
         arch: [ x64 ]
         # Do not build or run tests in debug for release branches
-        build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
+        build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
         include:
           - build-type: release
             arch: arm64
@@ -224,7 +220,7 @@ jobs:
     outputs:
       json: ${{ steps.get-benchmark-durations.outputs.json }}
     needs: [ check-permissions, build-build-tools-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
       credentials:
@@ -257,7 +253,7 @@ jobs:
   benchmarks:
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
     needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
       credentials:
@@ -302,9 +298,8 @@ jobs:
       with:
         channel-id: C060CNA47S9 # on-call-staging-storage-stream
         slack-message: |
-          Benchmarks failed on main: ${{ github.event.head_commit.url }}
-
-          Allure report: ${{ needs.create-test-report.outputs.report-url }}
+          Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
+          <${{ needs.create-test-report.outputs.report-url }}|Allure report>
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
@@ -314,7 +309,7 @@ jobs:
     outputs:
       report-url: ${{ steps.create-allure-report.outputs.report-url }}
 
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
       credentials:
@@ -361,7 +356,7 @@ jobs:
 
   coverage-report:
     needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
       credentials:
@@ -475,7 +470,7 @@ jobs:
       matrix:
         arch: [ x64, arm64 ]
 
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
       - name: Checkout
@@ -554,7 +549,7 @@ jobs:
         version: [ v14, v15, v16 ]
         arch: [ x64, arm64 ]
 
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
       - name: Checkout
@@ -699,7 +694,7 @@ jobs:
 
   vm-compute-node-image:
     needs: [ check-permissions, tag, compute-node-image ]
-    runs-on: [ self-hosted, gen3, large ]
+    runs-on: [ self-hosted, large ]
     strategy:
       fail-fast: false
       matrix:
@@ -748,7 +743,7 @@ jobs:
       matrix:
         arch: [ x64, arm64 ]
 
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
 
     steps:
       - name: Checkout
@@ -963,7 +958,7 @@ jobs:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
     if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
 
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
     steps:
       - name: Fix git ownership
@@ -983,7 +978,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
         with:
-          submodules: false
           fetch-depth: 0
 
       - name: Trigger deploy workflow
@@ -1064,7 +1058,7 @@ jobs:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
     if: github.ref_name == 'release'
 
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
       options: --init
@@ -1120,10 +1114,12 @@ jobs:
     # Format `needs` differently to make the list more readable.
     # Usually we do `needs: [...]`
     needs:
+      - build-and-test-locally
       - check-codestyle-python
       - check-codestyle-rust
-      - build-and-test-locally
+      - promote-images
       - test-images
+      - trigger-custom-extensions-build-and-wait
     runs-on: ubuntu-22.04
     steps:
       # The list of possible results:
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 2ee66cfdc1..7fecdbde8c 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -56,7 +56,6 @@ jobs:
         uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 1
 
       - name: Install macOS postgres dependencies
         run: brew install flex bison openssl protobuf icu4c pkg-config
@@ -158,7 +157,6 @@ jobs:
         uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 1
 
       # Some of our rust modules use FFI and need those to be checked
       - name: Get postgres headers
diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml
index ed4e6be712..615937b5a1 100644
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -27,7 +27,7 @@ concurrency:
 
 jobs:
   trigger_bench_on_ec2_machine_in_eu_central_1:
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, small ]
     container:
       image: neondatabase/build-tools:pinned
       credentials:

From 168913bdf0aa9665099b4bba2cf891ce8d48f691 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 Aug 2024 21:57:17 +0200
Subject: [PATCH 44/62] refactor(write path): newtype to enforce use of fully
 initialized slices (#8717)

The `tokio_epoll_uring::Slice` / `tokio_uring::Slice` type is weird.
The new `FullSlice` newtype is better. See the doc comment for details.

The naming is not ideal, but we'll clean that up in a future refactoring
where we move the `FullSlice` into `tokio_epoll_uring`. Then, we'll do
the following:
* tokio_epoll_uring::Slice is removed
* `FullSlice` becomes `tokio_epoll_uring::IoBufView`
* new type `tokio_epoll_uring::IoBufMutView` for the current
`tokio_epoll_uring::Slice<IoBufMut>`

Context
-------

I did this work in preparation for
https://github.com/neondatabase/neon/pull/8537.
There, I'm changing the type that the `inmemory_layer.rs` passes to
`DeltaLayerWriter::put_value_bytes` and thus it seemed like a good
opportunity to make this cleanup first.
---
 pageserver/src/tenant/blob_io.rs              |  93 ++++++++------
 .../src/tenant/ephemeral_file/page_caching.rs |  31 ++---
 .../zero_padded_read_write/zero_padded.rs     |   6 +-
 .../tenant/remote_timeline_client/download.rs |   5 +-
 .../src/tenant/storage_layer/delta_layer.rs   |  40 ++++--
 .../src/tenant/storage_layer/image_layer.rs   |   9 +-
 .../tenant/storage_layer/inmemory_layer.rs    |  25 +++-
 pageserver/src/virtual_file.rs                | 120 +++++++++---------
 pageserver/src/virtual_file/io_engine.rs      |  19 ++-
 .../owned_buffers_io/io_buf_ext.rs            |  78 ++++++++++++
 .../virtual_file/owned_buffers_io/slice.rs    |   4 +-
 .../util/size_tracking_writer.rs              |  13 +-
 .../virtual_file/owned_buffers_io/write.rs    |  71 +++++------
 13 files changed, 310 insertions(+), 204 deletions(-)
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs

diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index 8e9d349ca8..a245c99a88 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -24,6 +24,7 @@ use tracing::warn;
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::tenant::block_io::BlockCursor;
+use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};
@@ -186,11 +187,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
     /// You need to make sure that the internal buffer is empty, otherwise
     /// data will be written in wrong order.
     #[inline(always)]
-    async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    async fn write_all_unbuffered<Buf: IoBuf + Send>(
         &mut self,
-        src_buf: B,
+        src_buf: FullSlice<Buf>,
         ctx: &RequestContext,
-    ) -> (B::Buf, Result<(), Error>) {
+    ) -> (FullSlice<Buf>, Result<(), Error>) {
         let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
         let nbytes = match res {
             Ok(nbytes) => nbytes,
@@ -204,8 +205,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
     /// Flushes the internal buffer to the underlying `VirtualFile`.
     pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
         let buf = std::mem::take(&mut self.buf);
-        let (mut buf, res) = self.inner.write_all(buf, ctx).await;
+        let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await;
         res?;
+        let mut buf = slice.into_raw_slice().into_inner();
         buf.clear();
         self.buf = buf;
         Ok(())
@@ -222,19 +224,30 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
     }
 
     /// Internal, possibly buffered, write function
-    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    async fn write_all<Buf: IoBuf + Send>(
         &mut self,
-        src_buf: B,
+        src_buf: FullSlice<Buf>,
         ctx: &RequestContext,
-    ) -> (B::Buf, Result<(), Error>) {
+    ) -> (FullSlice<Buf>, Result<(), Error>) {
+        let src_buf = src_buf.into_raw_slice();
+        let src_buf_bounds = src_buf.bounds();
+        let restore = move |src_buf_slice: Slice<_>| {
+            FullSlice::must_new(Slice::from_buf_bounds(
+                src_buf_slice.into_inner(),
+                src_buf_bounds,
+            ))
+        };
+
         if !BUFFERED {
             assert!(self.buf.is_empty());
-            return self.write_all_unbuffered(src_buf, ctx).await;
+            return self
+                .write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
+                .await;
         }
         let remaining = Self::CAPACITY - self.buf.len();
         let src_buf_len = src_buf.bytes_init();
         if src_buf_len == 0 {
-            return (Slice::into_inner(src_buf.slice_full()), Ok(()));
+            return (restore(src_buf), Ok(()));
         }
         let mut src_buf = src_buf.slice(0..src_buf_len);
         // First try to copy as much as we can into the buffer
@@ -245,7 +258,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         // Then, if the buffer is full, flush it out
         if self.buf.len() == Self::CAPACITY {
             if let Err(e) = self.flush_buffer(ctx).await {
-                return (Slice::into_inner(src_buf), Err(e));
+                return (restore(src_buf), Err(e));
             }
         }
         // Finally, write the tail of src_buf:
@@ -258,27 +271,29 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                 let copied = self.write_into_buffer(&src_buf);
                 // We just verified above that src_buf fits into our internal buffer.
                 assert_eq!(copied, src_buf.len());
-                Slice::into_inner(src_buf)
+                restore(src_buf)
             } else {
-                let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await;
+                let (src_buf, res) = self
+                    .write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
+                    .await;
                 if let Err(e) = res {
                     return (src_buf, Err(e));
                 }
                 src_buf
             }
         } else {
-            Slice::into_inner(src_buf)
+            restore(src_buf)
         };
         (src_buf, Ok(()))
     }
 
     /// Write a blob of data. Returns the offset that it was written to,
     /// which can be used to retrieve the data later.
-    pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    pub async fn write_blob<Buf: IoBuf + Send>(
         &mut self,
-        srcbuf: B,
+        srcbuf: FullSlice<Buf>,
         ctx: &RequestContext,
-    ) -> (B::Buf, Result<u64, Error>) {
+    ) -> (FullSlice<Buf>, Result<u64, Error>) {
         let (buf, res) = self
             .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
             .await;
@@ -287,43 +302,40 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
 
     /// Write a blob of data. Returns the offset that it was written to,
     /// which can be used to retrieve the data later.
-    pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    pub(crate) async fn write_blob_maybe_compressed<Buf: IoBuf + Send>(
         &mut self,
-        srcbuf: B,
+        srcbuf: FullSlice<Buf>,
         ctx: &RequestContext,
         algorithm: ImageCompressionAlgorithm,
-    ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
+    ) -> (FullSlice<Buf>, Result<(u64, CompressionInfo), Error>) {
         let offset = self.offset;
         let mut compression_info = CompressionInfo {
             written_compressed: false,
             compressed_size: None,
         };
 
-        let len = srcbuf.bytes_init();
+        let len = srcbuf.len();
 
         let mut io_buf = self.io_buf.take().expect("we always put it back below");
         io_buf.clear();
         let mut compressed_buf = None;
-        let ((io_buf, hdr_res), srcbuf) = async {
+        let ((io_buf_slice, hdr_res), srcbuf) = async {
             if len < 128 {
                 // Short blob. Write a 1-byte length header
                 io_buf.put_u8(len as u8);
-                (
-                    self.write_all(io_buf, ctx).await,
-                    srcbuf.slice_full().into_inner(),
-                )
+                (self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
             } else {
                 // Write a 4-byte length header
                 if len > MAX_SUPPORTED_LEN {
                     return (
                         (
-                            io_buf,
+                            io_buf.slice_len(),
                             Err(Error::new(
                                 ErrorKind::Other,
                                 format!("blob too large ({len} bytes)"),
                             )),
                         ),
-                        srcbuf.slice_full().into_inner(),
+                        srcbuf,
                     );
                 }
                 let (high_bit_mask, len_written, srcbuf) = match algorithm {
@@ -336,8 +348,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                         } else {
                             async_compression::tokio::write::ZstdEncoder::new(Vec::new())
                         };
-                        let slice = srcbuf.slice_full();
-                        encoder.write_all(&slice[..]).await.unwrap();
+                        encoder.write_all(&srcbuf[..]).await.unwrap();
                         encoder.shutdown().await.unwrap();
                         let compressed = encoder.into_inner();
                         compression_info.compressed_size = Some(compressed.len());
@@ -345,31 +356,29 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                             compression_info.written_compressed = true;
                             let compressed_len = compressed.len();
                             compressed_buf = Some(compressed);
-                            (BYTE_ZSTD, compressed_len, slice.into_inner())
+                            (BYTE_ZSTD, compressed_len, srcbuf)
                         } else {
-                            (BYTE_UNCOMPRESSED, len, slice.into_inner())
+                            (BYTE_UNCOMPRESSED, len, srcbuf)
                         }
                     }
-                    ImageCompressionAlgorithm::Disabled => {
-                        (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
-                    }
+                    ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf),
                 };
                 let mut len_buf = (len_written as u32).to_be_bytes();
                 assert_eq!(len_buf[0] & 0xf0, 0);
                 len_buf[0] |= high_bit_mask;
                 io_buf.extend_from_slice(&len_buf[..]);
-                (self.write_all(io_buf, ctx).await, srcbuf)
+                (self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
             }
         }
         .await;
-        self.io_buf = Some(io_buf);
+        self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner());
         match hdr_res {
             Ok(_) => (),
-            Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
+            Err(e) => return (srcbuf, Err(e)),
         }
         let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
-            let (_buf, res) = self.write_all(compressed_buf, ctx).await;
-            (Slice::into_inner(srcbuf.slice(..)), res)
+            let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await;
+            (srcbuf, res)
         } else {
             self.write_all(srcbuf, ctx).await
         };
@@ -432,21 +441,21 @@ pub(crate) mod tests {
                 let (_, res) = if compression {
                     let res = wtr
                         .write_blob_maybe_compressed(
-                            blob.clone(),
+                            blob.clone().slice_len(),
                             ctx,
                             ImageCompressionAlgorithm::Zstd { level: Some(1) },
                         )
                         .await;
                     (res.0, res.1.map(|(off, _)| off))
                 } else {
-                    wtr.write_blob(blob.clone(), ctx).await
+                    wtr.write_blob(blob.clone().slice_len(), ctx).await
                 };
                 let offs = res?;
                 offsets.push(offs);
             }
             // Write out one page worth of zeros so that we can
             // read again with read_blk
-            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await;
             let offs = res?;
             println!("Writing final blob at offs={offs}");
             wtr.flush_buffer(ctx).await?;
diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs
index 0a12b64a7c..7355b3b5a3 100644
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -4,6 +4,7 @@
 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::BlockLease;
+use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
 use crate::virtual_file::VirtualFile;
 
 use once_cell::sync::Lazy;
@@ -208,21 +209,11 @@ impl PreWarmingWriter {
 }
 
 impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
-    async fn write_all<
-        B: tokio_epoll_uring::BoundedBuf<Buf = Buf>,
-        Buf: tokio_epoll_uring::IoBuf + Send,
-    >(
+    async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
         &mut self,
-        buf: B,
+        buf: FullSlice<Buf>,
         ctx: &RequestContext,
-    ) -> std::io::Result<(usize, B::Buf)> {
-        let buf = buf.slice(..);
-        let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
-        let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) {
-            Some(buf.to_vec())
-        } else {
-            None
-        };
+    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
         let buflen = buf.len();
         assert_eq!(
             buflen % PAGE_SZ,
@@ -231,10 +222,10 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
         );
 
         // Do the IO.
-        let iobuf = match self.file.write_all(buf, ctx).await {
-            (iobuf, Ok(nwritten)) => {
+        let buf = match self.file.write_all(buf, ctx).await {
+            (buf, Ok(nwritten)) => {
                 assert_eq!(nwritten, buflen);
-                iobuf
+                buf
             }
             (_, Err(e)) => {
                 return Err(std::io::Error::new(
@@ -248,12 +239,6 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
             }
         };
 
-        // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf)
-        let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds);
-        if let Some(check_bounds_stuff_works) = check_bounds_stuff_works {
-            assert_eq!(&check_bounds_stuff_works, &*buf);
-        }
-
         let nblocks = buflen / PAGE_SZ;
         let nblocks32 = u32::try_from(nblocks).unwrap();
 
@@ -300,6 +285,6 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
         }
 
         self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
-        Ok((buflen, buf.into_inner()))
+        Ok((buflen, buf))
     }
 }
diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
index f90291bbf8..2dc0277638 100644
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
@@ -5,6 +5,8 @@
 
 use std::mem::MaybeUninit;
 
+use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
+
 /// See module-level comment.
 pub struct Buffer<const N: usize> {
     allocation: Box<[u8; N]>,
@@ -60,10 +62,10 @@ impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Bu
         self.written
     }
 
-    fn flush(self) -> tokio_epoll_uring::Slice<Self> {
+    fn flush(self) -> FullSlice<Self> {
         self.invariants();
         let written = self.written;
-        tokio_epoll_uring::BoundedBuf::slice(self, 0..written)
+        FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
     }
 
     fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index a17b32c983..8199218c3c 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -23,6 +23,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
+use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
@@ -219,9 +220,7 @@ async fn download_object<'a>(
                             Ok(chunk) => chunk,
                             Err(e) => return Err(e),
                         };
-                        buffered
-                            .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx)
-                            .await?;
+                        buffered.write_buffered(chunk.slice_len(), ctx).await?;
                     }
                     let size_tracking = buffered.flush_and_into_inner(ctx).await?;
                     Ok(size_tracking.into_inner())
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 0ed2f72c3f..6c2391d72d 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -42,6 +42,7 @@ use crate::tenant::vectored_blob_io::{
     VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
+use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -63,6 +64,7 @@ use std::os::unix::fs::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
+use tokio_epoll_uring::IoBufMut;
 use tracing::*;
 
 use utils::{
@@ -436,19 +438,28 @@ impl DeltaLayerWriterInner {
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let (_, res) = self
-            .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx)
+            .put_value_bytes(
+                key,
+                lsn,
+                Value::ser(&val)?.slice_len(),
+                val.will_init(),
+                ctx,
+            )
             .await;
         res
     }
 
-    async fn put_value_bytes(
+    async fn put_value_bytes<Buf>(
         &mut self,
         key: Key,
         lsn: Lsn,
-        val: Vec<u8>,
+        val: FullSlice<Buf>,
         will_init: bool,
         ctx: &RequestContext,
-    ) -> (Vec<u8>, anyhow::Result<()>) {
+    ) -> (FullSlice<Buf>, anyhow::Result<()>)
+    where
+        Buf: IoBufMut + Send,
+    {
         assert!(
             self.lsn_range.start <= lsn,
             "lsn_start={}, lsn={}",
@@ -514,7 +525,7 @@ impl DeltaLayerWriterInner {
         file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
             .await?;
         for buf in block_buf.blocks {
-            let (_buf, res) = file.write_all(buf, ctx).await;
+            let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
             res?;
         }
         assert!(self.lsn_range.start < self.lsn_range.end);
@@ -534,7 +545,7 @@ impl DeltaLayerWriterInner {
         // TODO: could use smallvec here but it's a pain with Slice<T>
         Summary::ser_into(&summary, &mut buf)?;
         file.seek(SeekFrom::Start(0)).await?;
-        let (_buf, res) = file.write_all(buf, ctx).await;
+        let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
         res?;
 
         let metadata = file
@@ -646,14 +657,17 @@ impl DeltaLayerWriter {
             .await
     }
 
-    pub async fn put_value_bytes(
+    pub async fn put_value_bytes<Buf>(
         &mut self,
         key: Key,
         lsn: Lsn,
-        val: Vec<u8>,
+        val: FullSlice<Buf>,
         will_init: bool,
         ctx: &RequestContext,
-    ) -> (Vec<u8>, anyhow::Result<()>) {
+    ) -> (FullSlice<Buf>, anyhow::Result<()>)
+    where
+        Buf: IoBufMut + Send,
+    {
         self.inner
             .as_mut()
             .unwrap()
@@ -743,7 +757,7 @@ impl DeltaLayer {
         // TODO: could use smallvec here, but it's a pain with Slice<T>
         Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
         file.seek(SeekFrom::Start(0)).await?;
-        let (_buf, res) = file.write_all(buf, ctx).await;
+        let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
         res?;
         Ok(())
     }
@@ -1291,12 +1305,12 @@ impl DeltaLayerInner {
                         .put_value_bytes(
                             key,
                             lsn,
-                            std::mem::take(&mut per_blob_copy),
+                            std::mem::take(&mut per_blob_copy).slice_len(),
                             will_init,
                             ctx,
                         )
                         .await;
-                    per_blob_copy = tmp;
+                    per_blob_copy = tmp.into_raw_slice().into_inner();
 
                     res?;
 
@@ -1871,7 +1885,7 @@ pub(crate) mod test {
 
         for entry in entries {
             let (_, res) = writer
-                .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx)
+                .put_value_bytes(entry.key, entry.lsn, entry.value.slice_len(), false, &ctx)
                 .await;
             res?;
         }
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index f9d3fdf186..9a19e4e2c7 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,6 +38,7 @@ use crate::tenant::vectored_blob_io::{
     VectoredReadPlanner,
 };
 use crate::tenant::{PageReconstructError, Timeline};
+use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
@@ -354,7 +355,7 @@ impl ImageLayer {
         // TODO: could use smallvec here but it's a pain with Slice<T>
         Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
         file.seek(SeekFrom::Start(0)).await?;
-        let (_buf, res) = file.write_all(buf, ctx).await;
+        let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
         res?;
         Ok(())
     }
@@ -786,7 +787,7 @@ impl ImageLayerWriterInner {
         self.num_keys += 1;
         let (_img, res) = self
             .blob_writer
-            .write_blob_maybe_compressed(img, ctx, compression)
+            .write_blob_maybe_compressed(img.slice_len(), ctx, compression)
             .await;
         // TODO: re-use the buffer for `img` further upstack
         let (off, compression_info) = res?;
@@ -838,7 +839,7 @@ impl ImageLayerWriterInner {
             .await?;
         let (index_root_blk, block_buf) = self.tree.finish()?;
         for buf in block_buf.blocks {
-            let (_buf, res) = file.write_all(buf, ctx).await;
+            let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
             res?;
         }
 
@@ -858,7 +859,7 @@ impl ImageLayerWriterInner {
         // TODO: could use smallvec here but it's a pain with Slice<T>
         Summary::ser_into(&summary, &mut buf)?;
         file.seek(SeekFrom::Start(0)).await?;
-        let (_buf, res) = file.write_all(buf, ctx).await;
+        let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
         res?;
 
         let metadata = file
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index fb15ddfba9..748d79c149 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -12,6 +12,7 @@ use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
+use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache, walrecord};
 use anyhow::{anyhow, Result};
 use camino::Utf8PathBuf;
@@ -581,11 +582,17 @@ impl InMemoryLayer {
                     for (lsn, pos) in vec_map.as_slice() {
                         cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
                         let will_init = Value::des(&buf)?.will_init();
-                        let res;
-                        (buf, res) = delta_layer_writer
-                            .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, &ctx)
+                        let (tmp, res) = delta_layer_writer
+                            .put_value_bytes(
+                                Key::from_compact(*key),
+                                *lsn,
+                                buf.slice_len(),
+                                will_init,
+                                &ctx,
+                            )
                             .await;
                         res?;
+                        buf = tmp.into_raw_slice().into_inner();
                     }
                 }
             }
@@ -620,11 +627,17 @@ impl InMemoryLayer {
                         // => https://github.com/neondatabase/neon/issues/8183
                         cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
                         let will_init = Value::des(&buf)?.will_init();
-                        let res;
-                        (buf, res) = delta_layer_writer
-                            .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, ctx)
+                        let (tmp, res) = delta_layer_writer
+                            .put_value_bytes(
+                                Key::from_compact(*key),
+                                *lsn,
+                                buf.slice_len(),
+                                will_init,
+                                ctx,
+                            )
                             .await;
                         res?;
+                        buf = tmp.into_raw_slice().into_inner();
                     }
                 }
             }
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 27f6fe90a4..b4695e5f40 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -17,6 +17,7 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ};
 use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
+use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::shard::TenantShardId;
 use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
@@ -50,6 +51,7 @@ pub(crate) mod owned_buffers_io {
     //! but for the time being we're proving out the primitives in the neon.git repo
     //! for faster iteration.
 
+    pub(crate) mod io_buf_ext;
     pub(crate) mod slice;
     pub(crate) mod write;
     pub(crate) mod util {
@@ -637,24 +639,24 @@ impl VirtualFile {
     }
 
     // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
-    pub async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    pub async fn write_all_at<Buf: IoBuf + Send>(
         &self,
-        buf: B,
+        buf: FullSlice<Buf>,
         mut offset: u64,
         ctx: &RequestContext,
-    ) -> (B::Buf, Result<(), Error>) {
-        let buf_len = buf.bytes_init();
-        if buf_len == 0 {
-            return (Slice::into_inner(buf.slice_full()), Ok(()));
-        }
-        let mut buf = buf.slice(0..buf_len);
+    ) -> (FullSlice<Buf>, Result<(), Error>) {
+        let buf = buf.into_raw_slice();
+        let bounds = buf.bounds();
+        let restore =
+            |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds));
+        let mut buf = buf;
         while !buf.is_empty() {
-            let res;
-            (buf, res) = self.write_at(buf, offset, ctx).await;
+            let (tmp, res) = self.write_at(FullSlice::must_new(buf), offset, ctx).await;
+            buf = tmp.into_raw_slice();
             match res {
                 Ok(0) => {
                     return (
-                        Slice::into_inner(buf),
+                        restore(buf),
                         Err(Error::new(
                             std::io::ErrorKind::WriteZero,
                             "failed to write whole buffer",
@@ -666,33 +668,33 @@ impl VirtualFile {
                     offset += n as u64;
                 }
                 Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return (Slice::into_inner(buf), Err(e)),
+                Err(e) => return (restore(buf), Err(e)),
             }
         }
-        (Slice::into_inner(buf), Ok(()))
+        (restore(buf), Ok(()))
     }
 
-    /// Writes `buf.slice(0..buf.bytes_init())`.
-    /// Returns the IoBuf that is underlying the BoundedBuf `buf`.
-    /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
-    /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
-    pub async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    /// Writes `buf` to the file at the current offset.
+    ///
+    /// Panics if there is an uninitialized range in `buf`, as that is most likely a bug in the caller.
+    pub async fn write_all<Buf: IoBuf + Send>(
         &mut self,
-        buf: B,
+        buf: FullSlice<Buf>,
         ctx: &RequestContext,
-    ) -> (B::Buf, Result<usize, Error>) {
-        let nbytes = buf.bytes_init();
-        if nbytes == 0 {
-            return (Slice::into_inner(buf.slice_full()), Ok(0));
-        }
-        let mut buf = buf.slice(0..nbytes);
+    ) -> (FullSlice<Buf>, Result<usize, Error>) {
+        let buf = buf.into_raw_slice();
+        let bounds = buf.bounds();
+        let restore =
+            |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds));
+        let nbytes = buf.len();
+        let mut buf = buf;
         while !buf.is_empty() {
-            let res;
-            (buf, res) = self.write(buf, ctx).await;
+            let (tmp, res) = self.write(FullSlice::must_new(buf), ctx).await;
+            buf = tmp.into_raw_slice();
             match res {
                 Ok(0) => {
                     return (
-                        Slice::into_inner(buf),
+                        restore(buf),
                         Err(Error::new(
                             std::io::ErrorKind::WriteZero,
                             "failed to write whole buffer",
@@ -703,17 +705,17 @@ impl VirtualFile {
                     buf = buf.slice(n..);
                 }
                 Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return (Slice::into_inner(buf), Err(e)),
+                Err(e) => return (restore(buf), Err(e)),
             }
         }
-        (Slice::into_inner(buf), Ok(nbytes))
+        (restore(buf), Ok(nbytes))
     }
 
     async fn write<B: IoBuf + Send>(
         &mut self,
-        buf: Slice<B>,
+        buf: FullSlice<B>,
         ctx: &RequestContext,
-    ) -> (Slice<B>, Result<usize, std::io::Error>) {
+    ) -> (FullSlice<B>, Result<usize, std::io::Error>) {
         let pos = self.pos;
         let (buf, res) = self.write_at(buf, pos, ctx).await;
         let n = match res {
@@ -756,10 +758,10 @@ impl VirtualFile {
 
     async fn write_at<B: IoBuf + Send>(
         &self,
-        buf: Slice<B>,
+        buf: FullSlice<B>,
         offset: u64,
         _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
-    ) -> (Slice<B>, Result<usize, Error>) {
+    ) -> (FullSlice<B>, Result<usize, Error>) {
         let file_guard = match self.lock_file().await {
             Ok(file_guard) => file_guard,
             Err(e) => return (buf, Err(e)),
@@ -1093,11 +1095,11 @@ impl Drop for VirtualFile {
 
 impl OwnedAsyncWriter for VirtualFile {
     #[inline(always)]
-    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    async fn write_all<Buf: IoBuf + Send>(
         &mut self,
-        buf: B,
+        buf: FullSlice<Buf>,
         ctx: &RequestContext,
-    ) -> std::io::Result<(usize, B::Buf)> {
+    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
         let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
         res.map(move |v| (v, buf))
     }
@@ -1159,7 +1161,8 @@ mod tests {
     use crate::task_mgr::TaskKind;
 
     use super::*;
-    use owned_buffers_io::slice::SliceExt;
+    use owned_buffers_io::io_buf_ext::IoBufExt;
+    use owned_buffers_io::slice::SliceMutExt;
     use rand::seq::SliceRandom;
     use rand::thread_rng;
     use rand::Rng;
@@ -1193,9 +1196,9 @@ mod tests {
                 }
             }
         }
-        async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        async fn write_all_at<Buf: IoBuf + Send>(
             &self,
-            buf: B,
+            buf: FullSlice<Buf>,
             offset: u64,
             ctx: &RequestContext,
         ) -> Result<(), Error> {
@@ -1204,13 +1207,7 @@ mod tests {
                     let (_buf, res) = file.write_all_at(buf, offset, ctx).await;
                     res
                 }
-                MaybeVirtualFile::File(file) => {
-                    let buf_len = buf.bytes_init();
-                    if buf_len == 0 {
-                        return Ok(());
-                    }
-                    file.write_all_at(&buf.slice(0..buf_len), offset)
-                }
+                MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset),
             }
         }
         async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
@@ -1219,9 +1216,9 @@ mod tests {
                 MaybeVirtualFile::File(file) => file.seek(pos),
             }
         }
-        async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        async fn write_all<Buf: IoBuf + Send>(
             &mut self,
-            buf: B,
+            buf: FullSlice<Buf>,
             ctx: &RequestContext,
         ) -> Result<(), Error> {
             match self {
@@ -1229,13 +1226,7 @@ mod tests {
                     let (_buf, res) = file.write_all(buf, ctx).await;
                     res.map(|_| ())
                 }
-                MaybeVirtualFile::File(file) => {
-                    let buf_len = buf.bytes_init();
-                    if buf_len == 0 {
-                        return Ok(());
-                    }
-                    file.write_all(&buf.slice(0..buf_len))
-                }
+                MaybeVirtualFile::File(file) => file.write_all(&buf[..]),
             }
         }
 
@@ -1347,7 +1338,9 @@ mod tests {
             &ctx,
         )
         .await?;
-        file_a.write_all(b"foobar".to_vec(), &ctx).await?;
+        file_a
+            .write_all(b"foobar".to_vec().slice_len(), &ctx)
+            .await?;
 
         // cannot read from a file opened in write-only mode
         let _ = file_a.read_string(&ctx).await.unwrap_err();
@@ -1356,7 +1349,10 @@ mod tests {
         let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
 
         // cannot write to a file opened in read-only mode
-        let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
+        let _ = file_a
+            .write_all(b"bar".to_vec().slice_len(), &ctx)
+            .await
+            .unwrap_err();
 
         // Try simple read
         assert_eq!("foobar", file_a.read_string(&ctx).await?);
@@ -1399,8 +1395,12 @@ mod tests {
             &ctx,
         )
         .await?;
-        file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
-        file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?;
+        file_b
+            .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx)
+            .await?;
+        file_b
+            .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx)
+            .await?;
 
         assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
 
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 0ffcd9fa05..faef1ba9ff 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -12,7 +12,7 @@
 #[cfg(target_os = "linux")]
 pub(super) mod tokio_epoll_uring_ext;
 
-use tokio_epoll_uring::{IoBuf, Slice};
+use tokio_epoll_uring::IoBuf;
 use tracing::Instrument;
 
 pub(crate) use super::api::IoEngineKind;
@@ -107,7 +107,10 @@ use std::{
     sync::atomic::{AtomicU8, Ordering},
 };
 
-use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata};
+use super::{
+    owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt},
+    FileGuard, Metadata,
+};
 
 #[cfg(target_os = "linux")]
 fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
@@ -206,8 +209,8 @@ impl IoEngine {
         &self,
         file_guard: FileGuard,
         offset: u64,
-        buf: Slice<B>,
-    ) -> ((FileGuard, Slice<B>), std::io::Result<usize>) {
+        buf: FullSlice<B>,
+    ) -> ((FileGuard, FullSlice<B>), std::io::Result<usize>) {
         match self {
             IoEngine::NotSet => panic!("not initialized"),
             IoEngine::StdFs => {
@@ -217,8 +220,12 @@ impl IoEngine {
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
                 let system = tokio_epoll_uring_ext::thread_local_system().await;
-                let (resources, res) = system.write(file_guard, offset, buf).await;
-                (resources, res.map_err(epoll_uring_error_to_std))
+                let ((file_guard, slice), res) =
+                    system.write(file_guard, offset, buf.into_raw_slice()).await;
+                (
+                    (file_guard, FullSlice::must_new(slice)),
+                    res.map_err(epoll_uring_error_to_std),
+                )
             }
         }
     }
diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
new file mode 100644
index 0000000000..7c773b6b21
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
@@ -0,0 +1,78 @@
+//! See [`FullSlice`].
+
+use bytes::{Bytes, BytesMut};
+use std::ops::{Deref, Range};
+use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+
+/// The true owned equivalent for Rust [`slice`]. Use this for the write path.
+///
+/// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`,
+/// [`FullSlice`] is guaranteed to have all its bytes initialized. This means that
+/// [`<FullSlice as Deref<Target = [u8]>>::len`] is equal to [`Slice::bytes_init`] and [`Slice::bytes_total`].
+///
+pub struct FullSlice<B> {
+    slice: Slice<B>,
+}
+
+impl<B> FullSlice<B>
+where
+    B: IoBuf,
+{
+    pub(crate) fn must_new(slice: Slice<B>) -> Self {
+        assert_eq!(slice.bytes_init(), slice.bytes_total());
+        FullSlice { slice }
+    }
+    pub(crate) fn into_raw_slice(self) -> Slice<B> {
+        let FullSlice { slice: s } = self;
+        s
+    }
+}
+
+impl<B> Deref for FullSlice<B>
+where
+    B: IoBuf,
+{
+    type Target = [u8];
+
+    fn deref(&self) -> &[u8] {
+        let rust_slice = &self.slice[..];
+        assert_eq!(rust_slice.len(), self.slice.bytes_init());
+        assert_eq!(rust_slice.len(), self.slice.bytes_total());
+        rust_slice
+    }
+}
+
+pub(crate) trait IoBufExt {
+    /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`.
+    fn slice_len(self) -> FullSlice<Self>
+    where
+        Self: Sized;
+}
+
+macro_rules! impl_io_buf_ext {
+    ($T:ty) => {
+        impl IoBufExt for $T {
+            #[inline(always)]
+            fn slice_len(self) -> FullSlice<Self> {
+                let len = self.len();
+                let s = if len == 0 {
+                    // `BoundedBuf::slice(0..len)` or `BoundedBuf::slice(..)` has an incorrect assertion,
+                    // causing a panic if len == 0.
+                    // The Slice::from_buf_bounds has the correct assertion (<= instead of <).
+                    // => https://github.com/neondatabase/tokio-epoll-uring/issues/46
+                    let slice = self.slice_full();
+                    let mut bounds: Range<_> = slice.bounds();
+                    bounds.end = bounds.start;
+                    Slice::from_buf_bounds(slice.into_inner(), bounds)
+                } else {
+                    self.slice(0..len)
+                };
+                FullSlice::must_new(s)
+            }
+        }
+    };
+}
+
+impl_io_buf_ext!(Bytes);
+impl_io_buf_ext!(BytesMut);
+impl_io_buf_ext!(Vec<u8>);
diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs
index d19e5ddffe..6100593663 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs
@@ -3,14 +3,14 @@ use tokio_epoll_uring::BoundedBufMut;
 use tokio_epoll_uring::IoBufMut;
 use tokio_epoll_uring::Slice;
 
-pub(crate) trait SliceExt {
+pub(crate) trait SliceMutExt {
     /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO.
     ///
     /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]`
     fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8];
 }
 
-impl<B> SliceExt for Slice<B>
+impl<B> SliceMutExt for Slice<B>
 where
     B: IoBufMut,
 {
diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
index 55b1d0b46b..efcb61ba65 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
@@ -1,5 +1,8 @@
-use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter};
-use tokio_epoll_uring::{BoundedBuf, IoBuf};
+use crate::{
+    context::RequestContext,
+    virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter},
+};
+use tokio_epoll_uring::IoBuf;
 
 pub struct Writer<W> {
     dst: W,
@@ -35,11 +38,11 @@ where
     W: OwnedAsyncWriter,
 {
     #[inline(always)]
-    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    async fn write_all<Buf: IoBuf + Send>(
         &mut self,
-        buf: B,
+        buf: FullSlice<Buf>,
         ctx: &RequestContext,
-    ) -> std::io::Result<(usize, B::Buf)> {
+    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
         let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
         self.bytes_amount += u64::try_from(nwritten).unwrap();
         Ok((nwritten, buf))
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs
index 8599d95cdf..f8f37b17e3 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/write.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -1,16 +1,18 @@
 use bytes::BytesMut;
-use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+use tokio_epoll_uring::IoBuf;
 
 use crate::context::RequestContext;
 
+use super::io_buf_ext::{FullSlice, IoBufExt};
+
 /// A trait for doing owned-buffer write IO.
 /// Think [`tokio::io::AsyncWrite`] but with owned buffers.
 pub trait OwnedAsyncWriter {
-    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    async fn write_all<Buf: IoBuf + Send>(
         &mut self,
-        buf: B,
+        buf: FullSlice<Buf>,
         ctx: &RequestContext,
-    ) -> std::io::Result<(usize, B::Buf)>;
+    ) -> std::io::Result<(usize, FullSlice<Buf>)>;
 }
 
 /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
@@ -79,9 +81,11 @@ where
     #[cfg_attr(target_os = "macos", allow(dead_code))]
     pub async fn write_buffered<S: IoBuf + Send>(
         &mut self,
-        chunk: Slice<S>,
+        chunk: FullSlice<S>,
         ctx: &RequestContext,
-    ) -> std::io::Result<(usize, S)> {
+    ) -> std::io::Result<(usize, FullSlice<S>)> {
+        let chunk = chunk.into_raw_slice();
+
         let chunk_len = chunk.len();
         // avoid memcpy for the middle of the chunk
         if chunk.len() >= self.buf().cap() {
@@ -94,7 +98,10 @@ where
                     .pending(),
                 0
             );
-            let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?;
+            let (nwritten, chunk) = self
+                .writer
+                .write_all(FullSlice::must_new(chunk), ctx)
+                .await?;
             assert_eq!(nwritten, chunk_len);
             return Ok((nwritten, chunk));
         }
@@ -114,7 +121,7 @@ where
             }
         }
         assert!(slice.is_empty(), "by now we should have drained the chunk");
-        Ok((chunk_len, chunk.into_inner()))
+        Ok((chunk_len, FullSlice::must_new(chunk)))
     }
 
     /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
@@ -150,9 +157,12 @@ where
             self.buf = Some(buf);
             return Ok(());
         }
-        let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?;
+        let slice = buf.flush();
+        let (nwritten, slice) = self.writer.write_all(slice, ctx).await?;
         assert_eq!(nwritten, buf_len);
-        self.buf = Some(Buffer::reuse_after_flush(io_buf));
+        self.buf = Some(Buffer::reuse_after_flush(
+            slice.into_raw_slice().into_inner(),
+        ));
         Ok(())
     }
 }
@@ -172,9 +182,9 @@ pub trait Buffer {
     /// Number of bytes in the buffer.
     fn pending(&self) -> usize;
 
-    /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data
+    /// Turns `self` into a [`FullSlice`] of the pending data
     /// so we can use [`tokio_epoll_uring`] to write it to disk.
-    fn flush(self) -> Slice<Self::IoBuf>;
+    fn flush(self) -> FullSlice<Self::IoBuf>;
 
     /// After the write to disk is done and we have gotten back the slice,
     /// [`BufferedWriter`] uses this method to re-use the io buffer.
@@ -198,12 +208,8 @@ impl Buffer for BytesMut {
         self.len()
     }
 
-    fn flush(self) -> Slice<BytesMut> {
-        if self.is_empty() {
-            return self.slice_full();
-        }
-        let len = self.len();
-        self.slice(0..len)
+    fn flush(self) -> FullSlice<BytesMut> {
+        self.slice_len()
     }
 
     fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
@@ -213,18 +219,13 @@ impl Buffer for BytesMut {
 }
 
 impl OwnedAsyncWriter for Vec<u8> {
-    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+    async fn write_all<Buf: IoBuf + Send>(
         &mut self,
-        buf: B,
+        buf: FullSlice<Buf>,
         _: &RequestContext,
-    ) -> std::io::Result<(usize, B::Buf)> {
-        let nbytes = buf.bytes_init();
-        if nbytes == 0 {
-            return Ok((0, Slice::into_inner(buf.slice_full())));
-        }
-        let buf = buf.slice(0..nbytes);
+    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
         self.extend_from_slice(&buf[..]);
-        Ok((buf.len(), Slice::into_inner(buf)))
+        Ok((buf.len(), buf))
     }
 }
 
@@ -241,19 +242,13 @@ mod tests {
         writes: Vec<Vec<u8>>,
     }
     impl OwnedAsyncWriter for RecorderWriter {
-        async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        async fn write_all<Buf: IoBuf + Send>(
             &mut self,
-            buf: B,
+            buf: FullSlice<Buf>,
             _: &RequestContext,
-        ) -> std::io::Result<(usize, B::Buf)> {
-            let nbytes = buf.bytes_init();
-            if nbytes == 0 {
-                self.writes.push(vec![]);
-                return Ok((0, Slice::into_inner(buf.slice_full())));
-            }
-            let buf = buf.slice(0..nbytes);
+        ) -> std::io::Result<(usize, FullSlice<Buf>)> {
             self.writes.push(Vec::from(&buf[..]));
-            Ok((buf.len(), Slice::into_inner(buf)))
+            Ok((buf.len(), buf))
         }
     }
 
@@ -264,7 +259,7 @@ mod tests {
     macro_rules! write {
         ($writer:ident, $data:literal) => {{
             $writer
-                .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx())
+                .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx())
                 .await?;
         }};
     }

From fef77b0cc981f71238e1117d392ea55ec867e61f Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 15 Aug 2024 09:02:33 +0100
Subject: [PATCH 45/62] safekeeper: consider partial uploads when pulling
 timeline (#8628)

## Problem
The control file contains the id of the safekeeper that uploaded it.
Previously, when sending a snapshot of the control file to another sk,
it would eventually be gc-ed by the receiving sk. This is incorrect
because the original sk might still need it later.

## Summary of Changes
When sending a snapshot and the control file contains an uploaded
segment:
* Create a copy of the segment in s3 with the destination sk in the
  object name
* Tweak the streamed control file to point to the object create in the
  previous step

Note that the snapshot endpoint now has to know the id of the requestor,
so the api has been extended to include the node if of the destination
sk.

Closes https://github.com/neondatabase/neon/issues/8542
---
 safekeeper/src/control_file.rs           |  42 +++---
 safekeeper/src/http/client.rs            |   7 +-
 safekeeper/src/http/routes.rs            |  11 +-
 safekeeper/src/pull_timeline.rs          |  64 +++++++---
 safekeeper/src/wal_backup.rs             |  10 ++
 safekeeper/src/wal_backup_partial.rs     |  57 ++++++++-
 test_runner/fixtures/neon_fixtures.py    |  23 +++-
 test_runner/regress/test_wal_acceptor.py | 155 ++++++++++++++++++++++-
 8 files changed, 327 insertions(+), 42 deletions(-)

diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index d574bb438f..c551cd3122 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -164,6 +164,30 @@ impl Deref for FileStorage {
     }
 }
 
+impl TimelinePersistentState {
+    pub(crate) fn write_to_buf(&self) -> Result<Vec<u8>> {
+        let mut buf: Vec<u8> = Vec::new();
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
+
+        if self.eviction_state == EvictionState::Present {
+            // temp hack for forward compatibility
+            const PREV_FORMAT_VERSION: u32 = 8;
+            let prev = downgrade_v9_to_v8(self);
+            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
+            prev.ser_into(&mut buf)?;
+        } else {
+            // otherwise, we write the current format version
+            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
+            self.ser_into(&mut buf)?;
+        }
+
+        // calculate checksum before resize
+        let checksum = crc32c::crc32c(&buf);
+        buf.extend_from_slice(&checksum.to_le_bytes());
+        Ok(buf)
+    }
+}
+
 #[async_trait::async_trait]
 impl Storage for FileStorage {
     /// Persists state durably to the underlying storage.
@@ -180,24 +204,8 @@ impl Storage for FileStorage {
                 &control_partial_path
             )
         })?;
-        let mut buf: Vec<u8> = Vec::new();
-        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
 
-        if s.eviction_state == EvictionState::Present {
-            // temp hack for forward compatibility
-            const PREV_FORMAT_VERSION: u32 = 8;
-            let prev = downgrade_v9_to_v8(s);
-            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
-            prev.ser_into(&mut buf)?;
-        } else {
-            // otherwise, we write the current format version
-            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
-            s.ser_into(&mut buf)?;
-        }
-
-        // calculate checksum before resize
-        let checksum = crc32c::crc32c(&buf);
-        buf.extend_from_slice(&checksum.to_le_bytes());
+        let buf: Vec<u8> = s.write_to_buf()?;
 
         control_partial.write_all(&buf).await.with_context(|| {
             format!(
diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs
index 0bb31c200d..c56f7880d4 100644
--- a/safekeeper/src/http/client.rs
+++ b/safekeeper/src/http/client.rs
@@ -10,7 +10,7 @@
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
     http::error::HttpErrorBody,
-    id::{TenantId, TimelineId},
+    id::{NodeId, TenantId, TimelineId},
     logging::SecretString,
 };
 
@@ -97,10 +97,11 @@ impl Client {
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
+        stream_to: NodeId,
     ) -> Result<reqwest::Response> {
         let uri = format!(
-            "{}/v1/tenant/{}/timeline/{}/snapshot",
-            self.mgmt_api_endpoint, tenant_id, timeline_id
+            "{}/v1/tenant/{}/timeline/{}/snapshot/{}",
+            self.mgmt_api_endpoint, tenant_id, timeline_id, stream_to.0
         );
         self.get(&uri).await
     }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index fe6d325cee..c9defb0bcf 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -205,6 +205,7 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
 
 /// Stream tar archive with all timeline data.
 async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let destination = parse_request_param(&request, "destination_id")?;
     let ttid = TenantTimelineId::new(
         parse_request_param(&request, "tenant_id")?,
         parse_request_param(&request, "timeline_id")?,
@@ -225,7 +226,13 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
     // so create the chan and write to it in another task.
     let (tx, rx) = mpsc::channel(1);
 
-    task::spawn(pull_timeline::stream_snapshot(tli, tx));
+    let conf = get_conf(&request);
+    task::spawn(pull_timeline::stream_snapshot(
+        tli,
+        conf.my_id,
+        destination,
+        tx,
+    ));
 
     let rx_stream = ReceiverStream::new(rx);
     let body = Body::wrap_stream(rx_stream);
@@ -565,7 +572,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             request_span(r, tenant_delete_handler)
         })
         .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id",
             |r| request_span(r, timeline_snapshot_handler),
         )
         .post("/v1/pull_timeline", |r| {
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 618c6b278f..1eacec9981 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -11,13 +11,8 @@ use std::{
     io::{self, ErrorKind},
     sync::Arc,
 };
-use tokio::{
-    fs::{File, OpenOptions},
-    io::AsyncWrite,
-    sync::mpsc,
-    task,
-};
-use tokio_tar::{Archive, Builder};
+use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
+use tokio_tar::{Archive, Builder, Header};
 use tokio_util::{
     io::{CopyToBytes, SinkWriter},
     sync::PollSender,
@@ -32,13 +27,15 @@ use crate::{
         routes::TimelineStatus,
     },
     safekeeper::Term,
+    state::TimelinePersistentState,
     timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline},
+    wal_backup,
     wal_storage::{self, open_wal_file, Storage},
     GlobalTimelines, SafeKeeperConf,
 };
 use utils::{
     crashsafe::{durable_rename, fsync_async_opt},
-    id::{TenantId, TenantTimelineId, TimelineId},
+    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
     logging::SecretString,
     lsn::Lsn,
     pausable_failpoint,
@@ -46,8 +43,13 @@ use utils::{
 
 /// Stream tar archive of timeline to tx.
 #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
-pub async fn stream_snapshot(tli: WalResidentTimeline, tx: mpsc::Sender<Result<Bytes>>) {
-    if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await {
+pub async fn stream_snapshot(
+    tli: WalResidentTimeline,
+    source: NodeId,
+    destination: NodeId,
+    tx: mpsc::Sender<Result<Bytes>>,
+) {
+    if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
         // Error type/contents don't matter as they won't can't reach the client
         // (hyper likely doesn't do anything with it), but http stream will be
         // prematurely terminated. It would be nice to try to send the error in
@@ -81,6 +83,8 @@ impl Drop for SnapshotContext {
 
 pub async fn stream_snapshot_guts(
     tli: WalResidentTimeline,
+    source: NodeId,
+    destination: NodeId,
     tx: mpsc::Sender<Result<Bytes>>,
 ) -> Result<()> {
     // tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
@@ -104,7 +108,7 @@ pub async fn stream_snapshot_guts(
     // which is also likely suboptimal.
     let mut ar = Builder::new_non_terminated(pinned_writer);
 
-    let bctx = tli.start_snapshot(&mut ar).await?;
+    let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
     pausable_failpoint!("sk-snapshot-after-list-pausable");
 
     let tli_dir = tli.get_timeline_dir();
@@ -158,13 +162,43 @@ impl WalResidentTimeline {
     async fn start_snapshot<W: AsyncWrite + Unpin + Send>(
         &self,
         ar: &mut tokio_tar::Builder<W>,
+        source: NodeId,
+        destination: NodeId,
     ) -> Result<SnapshotContext> {
         let mut shared_state = self.write_shared_state().await;
         let wal_seg_size = shared_state.get_wal_seg_size();
 
-        let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME);
-        let mut cf = File::open(cf_path).await?;
-        ar.append_file(CONTROL_FILE_NAME, &mut cf).await?;
+        let mut control_store = TimelinePersistentState::clone(shared_state.sk.state());
+        // Modify the partial segment of the in-memory copy for the control file to
+        // point to the destination safekeeper.
+        let replace = control_store
+            .partial_backup
+            .replace_uploaded_segment(source, destination)?;
+
+        if let Some(replace) = replace {
+            // The deserialized control file has an uploaded partial. We upload a copy
+            // of it to object storage for the destination safekeeper and send an updated
+            // control file in the snapshot.
+            tracing::info!(
+                "Replacing uploaded partial segment in in-mem control file: {replace:?}"
+            );
+
+            let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?;
+            wal_backup::copy_partial_segment(
+                &replace.previous.remote_path(&remote_timeline_path),
+                &replace.current.remote_path(&remote_timeline_path),
+            )
+            .await?;
+        }
+
+        let buf = control_store
+            .write_to_buf()
+            .with_context(|| "failed to serialize control store")?;
+        let mut header = Header::new_gnu();
+        header.set_size(buf.len().try_into().expect("never breaches u64"));
+        ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
+            .await
+            .with_context(|| "failed to append to archive")?;
 
         // We need to stream since the oldest segment someone (s3 or pageserver)
         // still needs. This duplicates calc_horizon_lsn logic.
@@ -342,7 +376,7 @@ async fn pull_timeline(
     let client = Client::new(host.clone(), sk_auth_token.clone());
     // Request stream with basebackup archive.
     let bb_resp = client
-        .snapshot(status.tenant_id, status.timeline_id)
+        .snapshot(status.tenant_id, status.timeline_id, conf.my_id)
         .await?;
 
     // Make Stream of Bytes from it...
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 234273e133..aa1a6696a1 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -483,6 +483,16 @@ pub(crate) async fn backup_partial_segment(
         .await
 }
 
+pub(crate) async fn copy_partial_segment(
+    source: &RemotePath,
+    destination: &RemotePath,
+) -> Result<()> {
+    let storage = get_configured_remote_storage();
+    let cancel = CancellationToken::new();
+
+    storage.copy_object(source, destination, &cancel).await
+}
+
 pub async fn read_object(
     file_path: &RemotePath,
     offset: u64,
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 52765b0e98..675a051887 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -17,14 +17,13 @@
 //! file. Code updates state in the control file before doing any S3 operations.
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
-
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
 use serde::{Deserialize, Serialize};
 
 use tracing::{debug, error, info, instrument, warn};
-use utils::lsn::Lsn;
+use utils::{id::NodeId, lsn::Lsn};
 
 use crate::{
     metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
@@ -82,6 +81,12 @@ pub struct State {
     pub segments: Vec<PartialRemoteSegment>,
 }
 
+#[derive(Debug)]
+pub(crate) struct ReplaceUploadedSegment {
+    pub(crate) previous: PartialRemoteSegment,
+    pub(crate) current: PartialRemoteSegment,
+}
+
 impl State {
     /// Find an Uploaded segment. There should be only one Uploaded segment at a time.
     pub(crate) fn uploaded_segment(&self) -> Option<PartialRemoteSegment> {
@@ -90,6 +95,54 @@ impl State {
             .find(|seg| seg.status == UploadStatus::Uploaded)
             .cloned()
     }
+
+    /// Replace the name of the Uploaded segment (if one exists) in order to match
+    /// it with `destination` safekeeper. Returns a description of the change or None
+    /// wrapped in anyhow::Result.
+    pub(crate) fn replace_uploaded_segment(
+        &mut self,
+        source: NodeId,
+        destination: NodeId,
+    ) -> anyhow::Result<Option<ReplaceUploadedSegment>> {
+        let current = self
+            .segments
+            .iter_mut()
+            .find(|seg| seg.status == UploadStatus::Uploaded);
+
+        let current = match current {
+            Some(some) => some,
+            None => {
+                return anyhow::Ok(None);
+            }
+        };
+
+        // Sanity check that the partial segment we are replacing is belongs
+        // to the `source` SK.
+        if !current
+            .name
+            .ends_with(format!("sk{}.partial", source.0).as_str())
+        {
+            anyhow::bail!(
+                "Partial segment name ({}) doesn't match self node id ({})",
+                current.name,
+                source
+            );
+        }
+
+        let previous = current.clone();
+
+        let new_name = current.name.replace(
+            format!("_sk{}", source.0).as_str(),
+            format!("_sk{}", destination.0).as_str(),
+        );
+
+        current.name = new_name;
+
+        anyhow::Ok(Some(ReplaceUploadedSegment {
+            previous,
+            current: current.clone(),
+        }))
+    }
 }
 
 struct PartialBackup {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index aaa1f21997..b76432127d 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -67,6 +67,7 @@ from fixtures.pageserver.utils import (
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
+    LocalFsStorage,
     MockS3Server,
     RemoteStorage,
     RemoteStorageKind,
@@ -4425,14 +4426,32 @@ class Safekeeper(LogUtils):
     def timeline_dir(self, tenant_id, timeline_id) -> Path:
         return self.data_dir / str(tenant_id) / str(timeline_id)
 
+    def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId):
+        tline_path = (
+            self.env.repo_dir
+            / "local_fs_remote_storage"
+            / "safekeeper"
+            / str(tenant_id)
+            / str(timeline_id)
+        )
+        assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage)
+        return self._list_segments_in_dir(
+            tline_path, lambda name: ".metadata" not in name and ".___temp" not in name
+        )
+
     def list_segments(self, tenant_id, timeline_id) -> List[str]:
         """
         Get list of segment names of the given timeline.
         """
         tli_dir = self.timeline_dir(tenant_id, timeline_id)
+        return self._list_segments_in_dir(
+            tli_dir, lambda name: not name.startswith("safekeeper.control")
+        )
+
+    def _list_segments_in_dir(self, path: Path, keep_filter: Callable[[str], bool]) -> list[str]:
         segments = []
-        for _, _, filenames in os.walk(tli_dir):
-            segments.extend([f for f in filenames if not f.startswith("safekeeper.control")])
+        for _, _, filenames in os.walk(path):
+            segments.extend([f for f in filenames if keep_filter(f)])
         segments.sort()
         return segments
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index bf7829fc84..5d3b263936 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -49,7 +49,13 @@ from fixtures.remote_storage import (
 )
 from fixtures.safekeeper.http import SafekeeperHttpClient
 from fixtures.safekeeper.utils import are_walreceivers_absent
-from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background
+from fixtures.utils import (
+    PropagatingThread,
+    get_dir_size,
+    query_scalar,
+    start_in_background,
+    wait_until,
+)
 
 
 def wait_lsn_force_checkpoint(
@@ -63,6 +69,18 @@ def wait_lsn_force_checkpoint(
     lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
     log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver")
 
+    wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
+
+
+def wait_lsn_force_checkpoint_at(
+    lsn: Lsn,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ps: NeonPageserver,
+    pageserver_conn_options=None,
+):
+    pageserver_conn_options = pageserver_conn_options or {}
+
     auth_token = None
     if "password" in pageserver_conn_options:
         auth_token = pageserver_conn_options["password"]
@@ -2304,3 +2322,138 @@ def test_s3_eviction(
     )
 
     assert event_metrics_seen
+
+
+def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder):
+    """
+    Verify that pulling timeline from a SK with an uploaded partial segment
+    does not lead to consistency issues:
+    1. Start 3 SKs - only use two
+    2. Ingest a bit of WAL
+    3. Wait for partial to be uploaded
+    4. Pull timeline to the third SK
+    6. Replace source with destination SK and start compute
+    5. Wait for source SK to evict timeline
+    6. Go back to initial compute SK config and validate that
+    source SK can unevict the timeline (S3 state is consistent)
+    """
+    neon_env_builder.auth_enabled = True
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+
+    neon_env_builder.safekeeper_extra_opts = [
+        "--enable-offload",
+        "--delete-offloaded-wal",
+        "--partial-backup-timeout",
+        "500ms",
+        "--control-file-save-interval",
+        "500ms",
+        "--eviction-min-resident=500ms",
+    ]
+
+    env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"})
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
+
+    log.info("use only first 2 safekeepers, 3rd will be seeded")
+    endpoint = env.endpoints.create("main")
+    endpoint.active_safekeepers = [1, 2]
+    endpoint.start()
+    endpoint.safe_psql("create table t(key int, value text)")
+    endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'")
+
+    endpoint.stop()
+
+    def source_partial_segment_uploaded():
+        first_segment_name = "000000010000000000000001"
+        segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+
+        candidate_seg = None
+        for seg in segs:
+            if "partial" in seg and "sk1" in seg and not seg.startswith(first_segment_name):
+                candidate_seg = seg
+
+        if candidate_seg is not None:
+            # The term might change, causing the segment to be gc-ed shortly after,
+            # so give it a bit of time to make sure it's stable.
+            time.sleep(2)
+
+            segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+            assert candidate_seg in segs
+            return candidate_seg
+
+        raise Exception("Partial segment not uploaded yet")
+
+    source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded)
+    log.info(
+        f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
+    )
+    log.info(f"Tracking source partial segment: {source_partial_segment}")
+
+    src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id)
+    log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}")
+
+    pageserver_conn_options = {"password": env.auth_keys.generate_tenant_token(tenant_id)}
+    wait_lsn_force_checkpoint_at(
+        src_flush_lsn, tenant_id, timeline_id, env.pageserver, pageserver_conn_options
+    )
+
+    dst_sk.pull_timeline([src_sk], tenant_id, timeline_id)
+
+    def evicted():
+        evictions = src_sk.http_client().get_metric_value(
+            "safekeeper_eviction_events_completed_total", {"kind": "evict"}
+        )
+
+        if evictions is None or evictions == 0:
+            raise Exception("Eviction did not happen on source safekeeper yet")
+
+    wait_until(30, 1, evicted)
+
+    endpoint.start(safekeepers=[2, 3])
+
+    def new_partial_segment_uploaded():
+        segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+        for seg in segs:
+            if "partial" in seg and "sk3" in seg:
+                return seg
+
+        raise Exception("Partial segment not uploaded yet")
+
+    log.info(
+        f"Uploaded segments before post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
+    )
+
+    endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
+    wait_until(15, 1, new_partial_segment_uploaded)
+
+    log.info(
+        f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
+    )
+
+    # Allow for some gc iterations to happen and assert that the original
+    # uploaded partial segment remains in place.
+    time.sleep(5)
+    segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+    assert source_partial_segment in segs
+
+    log.info(
+        f"Uploaded segments at the end are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
+    )
+
+    # Restart the endpoint in order to check that the source safekeeper
+    # can unevict the timeline
+    endpoint.stop()
+    endpoint.start(safekeepers=[1, 2])
+
+    def unevicted():
+        unevictions = src_sk.http_client().get_metric_value(
+            "safekeeper_eviction_events_completed_total", {"kind": "restore"}
+        )
+
+        if unevictions is None or unevictions == 0:
+            raise Exception("Uneviction did not happen on source safekeeper yet")
+
+    wait_until(10, 1, unevicted)

From a9c28be7d02226032f153edf6c7b527aec9fa5db Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 15 Aug 2024 10:06:28 +0100
Subject: [PATCH 46/62] fix(pageserver): allow unused_imports in download.rs on
 macOS (#8733)

## Problem

On macOS, clippy fails with the following error:

```
error: unused import: `crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt`
  --> pageserver/src/tenant/remote_timeline_client/download.rs:26:5
   |
26 | use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
   |     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   |
   = note: `-D unused-imports` implied by `-D warnings`
   = help: to override `-D warnings` add `#[allow(unused_imports)]`
```

Introduced in https://github.com/neondatabase/neon/pull/8717

## Summary of changes
- allow `unused_imports` for
`crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt` on macOS
in download.rs
---
 pageserver/src/tenant/remote_timeline_client/download.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 8199218c3c..d9725ad756 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -23,6 +23,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
+#[cfg_attr(target_os = "macos", allow(unused_imports))]
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;

From d9a57aeed9ca9b0e2134e7183355d52fb6a089d1 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Aug 2024 12:54:05 +0300
Subject: [PATCH 47/62] storcon: deny external node configuration if an
 operation is ongoing (#8727)

Per #8674, disallow node configuration while drain/fill are ongoing.
Implement it by adding a only-http wrapper
`Service::external_node_configure` which checks for operation existing
before configuring.

Additionally:
- allow cancelling drain/fill after a pageserver has restarted and
transitioned to WarmingUp

Fixes: #8674
---
 libs/pageserver_api/src/controller_api.rs     |  3 --
 storage_controller/src/http.rs                |  2 +-
 storage_controller/src/service.rs             | 42 +++++++++++-------
 .../regress/test_storage_controller.py        | 44 +++++++++++++++++++
 4 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index a5b452da83..a50707a1b8 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -313,20 +313,17 @@ pub struct MetadataHealthUpdateRequest {
 pub struct MetadataHealthUpdateResponse {}
 
 #[derive(Serialize, Deserialize, Debug)]
-
 pub struct MetadataHealthListUnhealthyResponse {
     pub unhealthy_tenant_shards: Vec<TenantShardId>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
-
 pub struct MetadataHealthListOutdatedRequest {
     #[serde(with = "humantime_serde")]
     pub not_scrubbed_for: Duration,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
-
 pub struct MetadataHealthListOutdatedResponse {
     pub health_records: Vec<MetadataHealthRecord>,
 }
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index e8513b31eb..e755aaed19 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -500,7 +500,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
         StatusCode::OK,
         state
             .service
-            .node_configure(
+            .external_node_configure(
                 config_req.node_id,
                 config_req.availability.map(NodeAvailability::from),
                 config_req.scheduling,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ef4cd91efd..d717924ae6 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4912,6 +4912,26 @@ impl Service {
         Ok(())
     }
 
+    /// Wrapper around [`Self::node_configure`] which only allows changes while there is no ongoing
+    /// operation for HTTP api.
+    pub(crate) async fn external_node_configure(
+        &self,
+        node_id: NodeId,
+        availability: Option<NodeAvailability>,
+        scheduling: Option<NodeSchedulingPolicy>,
+    ) -> Result<(), ApiError> {
+        {
+            let locked = self.inner.read().unwrap();
+            if let Some(op) = locked.ongoing_operation.as_ref().map(|op| op.operation) {
+                return Err(ApiError::PreconditionFailed(
+                    format!("Ongoing background operation forbids configuring: {op}").into(),
+                ));
+            }
+        }
+
+        self.node_configure(node_id, availability, scheduling).await
+    }
+
     pub(crate) async fn start_node_drain(
         self: &Arc<Self>,
         node_id: NodeId,
@@ -5017,14 +5037,14 @@ impl Service {
     }
 
     pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
-        let (node_available, node_policy) = {
+        let node_available = {
             let locked = self.inner.read().unwrap();
             let nodes = &locked.nodes;
             let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
                 anyhow::anyhow!("Node {} not registered", node_id).into(),
             ))?;
 
-            (node.is_available(), node.get_scheduling())
+            node.is_available()
         };
 
         if !node_available {
@@ -5033,12 +5053,6 @@ impl Service {
             ));
         }
 
-        if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
-            return Err(ApiError::PreconditionFailed(
-                format!("Node {node_id} has no drain in progress").into(),
-            ));
-        }
-
         if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
             if let Operation::Drain(drain) = op_handler.operation {
                 if drain.node_id == node_id {
@@ -5152,14 +5166,14 @@ impl Service {
     }
 
     pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
-        let (node_available, node_policy) = {
+        let node_available = {
             let locked = self.inner.read().unwrap();
             let nodes = &locked.nodes;
             let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
                 anyhow::anyhow!("Node {} not registered", node_id).into(),
             ))?;
 
-            (node.is_available(), node.get_scheduling())
+            node.is_available()
         };
 
         if !node_available {
@@ -5168,12 +5182,6 @@ impl Service {
             ));
         }
 
-        if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
-            return Err(ApiError::PreconditionFailed(
-                format!("Node {node_id} has no fill in progress").into(),
-            ));
-        }
-
         if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
             if let Operation::Fill(fill) = op_handler.operation {
                 if fill.node_id == node_id {
@@ -5982,7 +5990,7 @@ impl Service {
                 .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
                 .await;
 
-            failpoint_support::sleep_millis_async!("sleepy-drain-loop");
+            failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
         }
 
         while !waiters.is_empty() {
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 9b2557a165..7d98ff2923 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2091,3 +2091,47 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
         )
         == 0
     )
+
+
+def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
+    # single unsharded tenant, two locations
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_start()
+
+    env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
+    env.storage_controller.reconcile_until_idle()
+
+    attached_id = int(env.storage_controller.locate(env.initial_tenant)[0]["node_id"])
+    attached = next((ps for ps in env.pageservers if ps.id == attached_id))
+
+    def attached_is_draining():
+        details = env.storage_controller.node_status(attached.id)
+        assert details["scheduling"] == "Draining"
+
+    env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)"))
+    env.storage_controller.node_drain(attached.id)
+
+    wait_until(10, 0.5, attached_is_draining)
+
+    attached.restart()
+
+    # we are unable to reconfigure node while the operation is still ongoing
+    with pytest.raises(
+        StorageControllerApiException,
+        match="Precondition failed: Ongoing background operation forbids configuring: drain.*",
+    ):
+        env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
+    with pytest.raises(
+        StorageControllerApiException,
+        match="Precondition failed: Ongoing background operation forbids configuring: drain.*",
+    ):
+        env.storage_controller.node_configure(attached.id, {"availability": "Offline"})
+
+    env.storage_controller.cancel_node_drain(attached.id)
+
+    def reconfigure_node_again():
+        env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
+
+    # allow for small delay between actually having cancelled and being able reconfigure again
+    wait_until(4, 0.5, reconfigure_node_again)

From 52641eb8533ec0bdd70523f2595a0265c9208dc7 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Aug 2024 15:30:04 +0300
Subject: [PATCH 48/62] storcon: add spans to drain/fill ops (#8735)

this way we do not need to repeat the %node_id everywhere, and we get no
stray messages in logs from within the op.
---
 storage_controller/src/service.rs | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index d717924ae6..84db088a42 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4989,6 +4989,8 @@ impl Service {
                     cancel: cancel.clone(),
                 });
 
+                let span = tracing::info_span!(parent: None, "drain_node", %node_id);
+
                 tokio::task::spawn({
                     let service = self.clone();
                     let cancel = cancel.clone();
@@ -5005,21 +5007,21 @@ impl Service {
                             }
                         }
 
-                        tracing::info!(%node_id, "Drain background operation starting");
+                        tracing::info!("Drain background operation starting");
                         let res = service.drain_node(node_id, cancel).await;
                         match res {
                             Ok(()) => {
-                                tracing::info!(%node_id, "Drain background operation completed successfully");
+                                tracing::info!("Drain background operation completed successfully");
                             }
                             Err(OperationError::Cancelled) => {
-                                tracing::info!(%node_id, "Drain background operation was cancelled");
+                                tracing::info!("Drain background operation was cancelled");
                             }
                             Err(err) => {
-                                tracing::error!(%node_id, "Drain background operation encountered: {err}")
+                                tracing::error!("Drain background operation encountered: {err}")
                             }
                         }
                     }
-                });
+                }.instrument(span));
             }
             NodeSchedulingPolicy::Draining => {
                 return Err(ApiError::Conflict(format!(
@@ -5118,6 +5120,8 @@ impl Service {
                     cancel: cancel.clone(),
                 });
 
+                let span = tracing::info_span!(parent: None, "fill_node", %node_id);
+
                 tokio::task::spawn({
                     let service = self.clone();
                     let cancel = cancel.clone();
@@ -5134,21 +5138,21 @@ impl Service {
                             }
                         }
 
-                        tracing::info!(%node_id, "Fill background operation starting");
+                        tracing::info!("Fill background operation starting");
                         let res = service.fill_node(node_id, cancel).await;
                         match res {
                             Ok(()) => {
-                                tracing::info!(%node_id, "Fill background operation completed successfully");
+                                tracing::info!("Fill background operation completed successfully");
                             }
                             Err(OperationError::Cancelled) => {
-                                tracing::info!(%node_id, "Fill background operation was cancelled");
+                                tracing::info!("Fill background operation was cancelled");
                             }
                             Err(err) => {
-                                tracing::error!(%node_id, "Fill background operation encountered: {err}")
+                                tracing::error!("Fill background operation encountered: {err}")
                             }
                         }
                     }
-                });
+                }.instrument(span));
             }
             NodeSchedulingPolicy::Filling => {
                 return Err(ApiError::Conflict(format!(

From 24d347f50b15bb8ba44f0b25589e180e6482e1a8 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Aug 2024 16:27:07 +0300
Subject: [PATCH 49/62] storcon: use tracing for logging panics (#8734)

this gives spans for panics, and does not globber loki output by writing
to stderr while all of the other logging is to stdout.

See: #3475
---
 storage_controller/src/main.rs | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 5a68799141..7387d36690 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -196,14 +196,26 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
 }
 
 fn main() -> anyhow::Result<()> {
-    let default_panic = std::panic::take_hook();
-    std::panic::set_hook(Box::new(move |info| {
-        default_panic(info);
-        std::process::exit(1);
-    }));
+    logging::init(
+        LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::Disabled,
+        logging::Output::Stdout,
+    )?;
+
+    // log using tracing so we don't get confused output by default hook writing to stderr
+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
 
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
 
+    let hook = std::panic::take_hook();
+    std::panic::set_hook(Box::new(move |info| {
+        // let sentry send a message (and flush)
+        // and trace the error
+        hook(info);
+
+        std::process::exit(1);
+    }));
+
     tokio::runtime::Builder::new_current_thread()
         // We use spawn_blocking for database operations, so require approximately
         // as many blocking threads as we will open database connections.
@@ -217,12 +229,6 @@ fn main() -> anyhow::Result<()> {
 async fn async_main() -> anyhow::Result<()> {
     let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
 
-    logging::init(
-        LogFormat::Plain,
-        logging::TracingErrorLayerEnablement::Disabled,
-        logging::Output::Stdout,
-    )?;
-
     preinitialize_metrics();
 
     let args = Cli::parse();

From f087423a0111d4fb5ac1e12007447c56b2a1c2a6 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 15 Aug 2024 16:28:25 +0300
Subject: [PATCH 50/62] Handle reload config file request in LR monitor (#8732)

## Problem

Logical replication BGW checking replication lag is not reloading config

## Summary of changes

Add handling of reload config request

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/neon.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 784d0f1da3..fe8e276d1c 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -192,6 +192,13 @@ LogicalSlotsMonitorMain(Datum main_arg)
 	{
 		XLogRecPtr	cutoff_lsn;
 
+		/* In case of a SIGHUP, just reload the configuration. */
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
 		/*
 		 * If there are too many .snap files, just drop all logical slots to
 		 * prevent aux files bloat.

From 4e58fd93216c5274e49488de161dc9ce12abd82d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 15 Aug 2024 18:37:15 +0100
Subject: [PATCH 51/62] CI(label-for-external-users): use CI_ACCESS_TOKEN
 (#8738)

## Problem

`secrets.GITHUB_TOKEN` (with any permissions) is not enough to get
a user's membership info if they decide to hide it.

## Summary of changes
- Use `secrets.CI_ACCESS_TOKEN` for `gh api` call
- Use `pull_request_target` instead of `pull_request` event to get
access to secrets
---
 .github/workflows/label-for-external-users.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
index 7cf5ee254c..585d118dfb 100644
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -4,7 +4,7 @@ on:
   issues:
     types:
       - opened
-  pull_request:
+  pull_request_target:
     types:
       - opened
 
@@ -25,7 +25,7 @@ jobs:
     - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
       id: check-user
       env:
-        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
       run: |
         if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
           is_member=true
@@ -45,10 +45,10 @@ jobs:
       issues: write        # for `gh issue edit`
 
     steps:
-    - name: Label new ${{ github.event_name }}
+    - name: Add `${{ env.LABEL }}` label
       env:
         GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }}
-        GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }}
+        ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }}
+        GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }}
       run: |
         gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}

From 69cb1ee479ecdc99dd117fe4149b59dd54676fea Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 15 Aug 2024 22:41:58 +0100
Subject: [PATCH 52/62] CI(replication-tests): store test results & change
 notification channel (#8687)

## Problem

We want to store Nightly Replication test results in the database and
notify the relevant Slack channel about failures

## Summary of changes
- Store test results in the database
- Notify `on-call-compute-staging-stream` about failures
---
 .github/workflows/benchmarking.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index f99a037489..a4a597acde 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -222,13 +222,20 @@ jobs:
       id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
+      with:
+        store-test-results-into-db: true
+      env:
+        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
+        slack-message: |
+          Periodic replication testing: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
@@ -330,7 +337,7 @@ jobs:
   prepare_AWS_RDS_databases:
     uses: ./.github/workflows/_benchmarking_preparation.yml
     secrets: inherit
-  
+
   pgbench-compare:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     needs: [ generate-matrices, prepare_AWS_RDS_databases ]

From df086cd139ee5ecc82bf096fc3fc6ee4397ac983 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 15 Aug 2024 15:34:45 -0700
Subject: [PATCH 53/62] Add logical replication test to exercise snapfiles
 (#8364)

---
 .../performance/test_logical_replication.py   | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index 4b4ffc1fee..c4e42a7834 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -262,3 +262,85 @@ def test_publisher_restart(
             sub_workload.terminate()
     finally:
         pub_workload.terminate()
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(2 * 60 * 60)
+def test_snap_files(
+    pg_bin: PgBin,
+    benchmark_project_pub: NeonApiEndpoint,
+    zenbenchmark: NeonBenchmarker,
+):
+    """
+    Creates a node with a replication slot. Generates pgbench into the replication slot,
+    then runs pgbench inserts while generating large numbers of snapfiles. Then restarts
+    the node and tries to peek the replication changes.
+    """
+    test_duration_min = 60
+    test_interval_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60 * 2}"
+
+    env = benchmark_project_pub.pgbench_env
+    connstr = benchmark_project_pub.connstr
+    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
+            is_super = cur.fetchall()[0]
+            assert is_super, "This benchmark won't work if we don't have superuser"
+
+    conn = psycopg2.connect(connstr)
+    conn.autocommit = True
+    cur = conn.cursor()
+    cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1")
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute("SELECT pg_reload_conf()")
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                DO $$
+                    BEGIN
+                    IF EXISTS (
+                        SELECT 1
+                        FROM pg_replication_slots
+                        WHERE slot_name = 'slotter'
+                    ) THEN
+                        PERFORM pg_drop_replication_slot('slotter');
+                    END IF;
+                END $$;
+            """
+            )
+            cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
+
+    workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
+    try:
+        start = time.time()
+        prev_measurement = time.time()
+        while time.time() - start < test_duration_min * 60:
+            with psycopg2.connect(connstr) as conn:
+                with conn.cursor() as cur:
+                    cur.execute(
+                        "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
+                    )
+                    check_pgbench_still_running(workload)
+                    cur.execute(
+                        "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
+                    )
+
+            # Measure storage
+            if time.time() - prev_measurement > test_interval_min * 60:
+                storage = benchmark_project_pub.get_synthetic_storage_size()
+                zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER)
+                prev_measurement = time.time()
+            time.sleep(test_interval_min * 60 / 3)
+
+    finally:
+        workload.terminate()

From 4763a960d103a27250eadd6892368ae77a3d66c4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 16 Aug 2024 08:10:05 +0300
Subject: [PATCH 54/62] chore: log if we have an open layer or any frozen on
 shutdown (#8740)

Some benchmarks are failing with a "long" flushing, which might be
because there is a queue of in-memory layers, or something else. Add
logging to narrow it down.

Private slack DM ref:
https://neondb.slack.com/archives/D049K7HJ9JM/p1723727305238099
---
 pageserver/src/tenant/timeline.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b4d908b130..01e77fa1b1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1645,6 +1645,20 @@ impl Timeline {
         self.last_record_lsn.shutdown();
 
         if try_freeze_and_flush {
+            if let Some((open, frozen)) = self
+                .layers
+                .read()
+                .await
+                .layer_map()
+                .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
+                .ok()
+                .filter(|(open, frozen)| *open || *frozen > 0)
+            {
+                tracing::info!(?open, frozen, "flushing and freezing on shutdown");
+            } else {
+                // this is double-shutdown, ignore it
+            }
+
             // we shut down walreceiver above, so, we won't add anything more
             // to the InMemoryLayer; freeze it and wait for all frozen layers
             // to reach the disk & upload queue, then shut the upload queue and

From 7fdc3ea16296ae7ac6f74ed2843ecee454391276 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 16 Aug 2024 13:30:53 +0300
Subject: [PATCH 55/62] Add retroactive RFC about physical replication (#8546)

We've had physical replication support for a long time, but we never
created an RFC for the feature. This RFC does that after the fact. Even
though we've already implemented the feature, let's have a design
discussion as if it hadn't done that. It can still be quite insightful.

This is written from a pretty compute-centric viewpoint, not much
on how it works in the control plane.
---
 docs/rfcs/036-physical-replication.md | 265 ++++++++++++++++++++++++++
 1 file changed, 265 insertions(+)
 create mode 100644 docs/rfcs/036-physical-replication.md

diff --git a/docs/rfcs/036-physical-replication.md b/docs/rfcs/036-physical-replication.md
new file mode 100644
index 0000000000..41aced0545
--- /dev/null
+++ b/docs/rfcs/036-physical-replication.md
@@ -0,0 +1,265 @@
+# Physical Replication
+
+This RFC is a bit special in that we have already implemented physical
+replication a long time ago. However, we never properly wrote down all
+the decisions and assumptions, and in the last months when more users
+have started to use the feature, numerous issues have surfaced.
+
+This RFC documents the design decisions that have been made.
+
+## Summary
+
+PostgreSQL has a feature called streaming replication, where a replica
+streams WAL from the primary and continuously applies it. It is also
+known as "physical replication", to distinguish it from logical
+replication.  In PostgreSQL, a replica is initialized by taking a
+physical backup of the primary. In Neon, the replica is initialized
+from a slim "base backup" from the pageserver, just like a primary,
+and the primary and the replicas connect to the same pageserver,
+sharing the storage.
+
+There are two kinds of read-only replicas in Neon:
+- replicas that follow the primary, and
+- "static" replicas that are pinned at a particular LSN.
+
+A static replica is useful e.g. for performing time-travel queries and
+running one-off slow queries without affecting the primary. A replica
+that follows the primary can be used e.g. to scale out read-only
+workloads.
+
+## Motivation
+
+Read-only replicas allow offloading read-only queries. It's useful for
+isolation, if you want to make sure that read-only queries don't
+affect the primary, and it's also an easy way to provide guaranteed
+read-only access to an application, without having to mess with access
+controls.
+
+## Non Goals (if relevant)
+
+This RFC is all about WAL-based *physical* replication. Logical
+replication is a different feature.
+
+Neon also has the capability to launch "static" read-only nodes which
+do not follow the primary, but are pinned to a particular LSN. They
+can be used for long-running one-off queries, or for Point-in-time
+queries. They work similarly to read replicas that follow the primary,
+but some things are simpler: there are no concerns about cache
+invalidation when the data changes on the primary, or worrying about
+transactions that are in-progress on the primary.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+- Control plane launches the replica
+- Replica Postgres instance connects to the safekeepers, to stream the WAL
+- The primary does not know about the standby, except for the hot standby feedback
+- The primary and replicas all connect to the same pageservers
+
+
+# Context
+
+Some useful things to know about hot standby and replicas in
+PostgreSQL.
+
+## PostgreSQL startup sequence
+
+"Running" and "start up" terms are little imprecise. PostgreSQL
+replica startup goes through several stages:
+
+1. First, the process is started up, and various initialization steps
+   are performed, like initializing shared memory. If you try to
+   connect to the server in this stage, you get an error: ERROR: the
+   database system is starting up. This stage happens very quickly, no
+
+2. Then the server reads the checpoint record from the WAL and starts
+   the WAL replay starting from the checkpoint. This works differently
+   in Neon: we start the WAL replay at the basebackup LSN, not from a
+   checkpoint! If you connect to the server in this state, you get an
+   error: ERROR: the database system is not yet accepting
+   connections. We proceed to the next stage, when the WAL replay sees
+   a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
+   can allow us to move directly to next stage, with all the caveats
+   listed in this RFC.
+
+3. When the running-xacts information is established, the server
+   starts to accept connections normally.
+
+From PostgreSQL's point of view, the server is already running in
+stage 2, even though it's not accepting connections yet. Our
+`compute_ctl` does not consider it as running until stage 3. If the
+transition from stage 2 to 3 doesn't happen fast enough, the control
+plane will mark the start operation as failed.
+
+
+## Decisions, Issues
+
+### Cache invalidation in replica
+
+When a read replica follows the primary in PostgreSQL, it needs to
+stream all the WAL from the primary and apply all the records, to keep
+the local copy of the data consistent with the primary. In Neon, the
+replica can fetch the updated page versions from the pageserver, so
+it's not necessary to apply all the WAL. However, it needs to ensure
+that any pages that are currently in the Postgres buffer cache, or the
+Local File Cache, are either updated, or thrown away so that the next
+read of the page will fetch the latest version.
+
+We choose to apply the WAL records for pages that are already in the
+buffer cache, and skip records for other pages. Somewhat arbitrarily,
+we also apply records affecting catalog relations, fetching the old
+page version from the pageserver if necessary first. See
+`neon_redo_read_buffer_filter()` function.
+
+The replica wouldn't necessarily need to see all the WAL records, only
+the records that apply to cached pages. For simplicity, we do stream
+all the WAL to the replica, and the replica simply ignores WAL records
+that require no action.
+
+Like in PostgreSQL, the read replica maintains a "replay LSN", which
+is the LSN up to which the replica has received and replayed the
+WAL. The replica can lag behind the primary, if it cannot quite keep
+up with the primary, or if a long-running query conflicts with changes
+that are about to be applied, or even intentionally if the user wishes
+to see delayed data (see recovery_min_apply_delay). It's important
+that the replica sees a consistent view of the whole cluster at the
+replay LSN, when it's lagging behind.
+
+In Neon, the replica connects to a safekeeper to get the WAL
+stream. That means that the safekeepers must be able to regurgitate
+the original WAL as far back as the replay LSN of any running read
+replica. (A static read-only node that does not follow the primary
+does not require a WAL stream however). The primary does not need to
+be running, and when it is, the replicas don't incur any extra
+overhead to the primary (see hot standby feedback though).
+
+### In-progress transactions
+
+In PostgreSQL, when a hot standby server starts up, it cannot
+immediately open up for queries (see [PostgreSQL startup
+sequence]). It first needs to establish a complete list of in-progress
+transactions, including subtransactions, that are running at the
+primary, at the current replay LSN. Normally that happens quickly,
+when the replica sees a "running-xacts" WAL record, because the
+primary writes a running-xacts WAL record at every checkpoint, and in
+PostgreSQL the replica always starts the WAL replay from a checkpoint
+REDO point. (A shutdown checkpoint WAL record also implies that all
+the non-prepared transactions have ended.) If there are a lot of
+subtransactions in progress, however, the standby might need to wait
+for old transactions to complete before it can open up for queries.
+
+In Neon that problem is worse: a replica can start at any LSN, so
+there's no guarantee that it will see a running-xacts record any time
+soon. In particular, if the primary is not running when the replica is
+started, it might never see a running-xacts record.
+
+To make things worse, we initially missed this issue, and always
+started accepting queries at replica startup, even if it didn't have
+the transaction information. That could lead to incorrect query
+results and data corruption later. However, as we fixed that, we
+introduced a new problem compared to what we had before: previously
+the replica would always start up, but after fixing that bug, it might
+not. In a superficial way, the old behavior was better (but could lead
+to serious issues later!). That made fixing that bug was very hard,
+because as we fixed it, we made things (superficially) worse for
+others.
+
+See https://github.com/neondatabase/neon/pull/7288 which fixed the
+bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
+and https://github.com/neondatabase/neon/pull/8484 to try to claw back
+the cases that started to cause trouble as fixing it. As of this
+writing, there are still cases where a replica might not immediately
+start up, causing the control plane operation to fail, the remaining
+issues are tracked in https://github.com/neondatabase/neon/issues/6211.
+
+One long-term fix for this is to switch to using so-called CSN
+snapshots in read replica. That would make it unnecessary to have the
+full in-progress transaction list in the replica at startup time. See
+https://commitfest.postgresql.org/48/4912/ for a work-in-progress
+patch to upstream to implement that.
+
+Another thing we could do is to teach the control plane about that
+distinction between "starting up" and "running but haven't received
+running-xacts information yet", so that we could keep the replica
+waiting longer in that stage, and also give any client connections the
+same `ERROR: the database system is not yet accepting connections`
+error that you get in standalone PostgreSQL in that state.
+
+
+### Recovery conflicts and Hot standby feedback
+
+It's possible that a tuple version is vacuumed away in the primary,
+even though it is still needed by a running transactions in the
+replica. This is called a "recovery conflict", and PostgreSQL provides
+various options for dealing with it. By default, the WAL replay will
+wait up to 30 s for the conflicting query to finish. After that, it
+will kill the running query, so that the WAL replay can proceed.
+
+Another way to avoid the situation is to enable the
+[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
+option. When it is enabled, the primary will refrain from vacuuming
+tuples that are still needed in the primary. That means potentially
+bloating the primary, which violates the usual rule that read replicas
+don't affect the operations on the primary, which is why it's off by
+default. We leave it to users to decide if they want to turn it on,
+same as PostgreSQL.
+
+Neon supports `hot_standby_feedback` by passing the feedback messages
+from the replica to the safekeepers, and from safekeepers to the
+primary.
+
+### Relationship of settings between primary and replica
+
+In order to enter hot standby mode, some configuration options need to
+be set to the same or larger values in the standby, compared to the
+primary.  See [explanation in the PostgreSQL
+docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN)
+
+In Neon, we have this problem too. To prevent customers from hitting
+it, the control plane automatically adjusts the settings of a replica,
+so that they match or exceed the primary's settings (see
+https://github.com/neondatabase/cloud/issues/14903). However, you
+can still hit the issue if the primary is restarted with larger
+settings, while the replica is running.
+
+
+### Interaction with Pageserver GC
+
+The read replica can lag behind the primary. If there are recovery
+conflicts or the replica cannot keep up for some reason, the lag can
+in principle grow indefinitely. The replica will issue all GetPage
+requests to the pageservers at the current replay LSN, and needs to
+see the old page versions.
+
+If the retention period in the pageserver is set to be small, it may
+have already garbage collected away the old page versions. That will
+cause read errors in the compute, and can mean that the replica cannot
+make progress with the replication anymore.
+
+There is a mechanism for replica to pass information about its replay
+LSN to the pageserver, so that the pageserver refrains from GC'ing
+data that is still needed by the standby. It's called
+'standby_horizon' in the pageserver code, see
+https://github.com/neondatabase/neon/pull/7368. A separate "lease"
+mechanism also is in the works, where the replica could hold a lease
+on the old LSN, preventing the pageserver from advancing the GC
+horizon past that point. The difference is that the standby_horizon
+mechanism relies on a feedback message from replica to safekeeper,
+while the least API is exposed directly from the pageserver. A static
+read-only node is not connected to safekeepers, so it cannot use the
+standby_horizon mechanism.
+
+
+### Synchronous replication
+
+We haven't put any effort into synchronous replication yet.
+
+PostgreSQL provides multiple levels of synchronicity. In the weaker
+levels, a transaction is not acknowledged as committed to the client
+in the primary until the WAL has been streamed to a replica or flushed
+to disk there. Those modes don't make senses in Neon, because the
+safekeepers handle durability.
+
+`synchronous_commit=remote_apply` mode would make sense. In that mode,
+the commit is not acknowledged to the client until it has been
+replayed in the replica. That ensures that after commit, you can see
+the commit in the replica too (aka. read-your-write consistency).

From 3f91ea28d997a23b899ef0c3ce237e7ae85f2916 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 16 Aug 2024 13:05:04 +0100
Subject: [PATCH 56/62] tests: add infra and test for storcon leadership
 transfer (#8587)

## Problem
https://github.com/neondatabase/neon/pull/8588 implemented the mechanism
for storage controller
leadership transfers. However, there's no tests that exercise the
behaviour.

## Summary of changes
1. Teach `neon_local` how to handle multiple storage controller
instances. Each storage controller
instance gets its own subdirectory (`storage_controller_1, ...`).
`storage_controller start|stop` subcommands
have also been extended to optionally accept an instance id.
2. Add a storage controller proxy test fixture. It's a basic HTTP server
that forwards requests from pageserver
and test env to the currently configured storage controller.
3. Add a test which exercises storage controller leadership transfer.
4. Finally fix a couple bugs that the test surfaced
---
 control_plane/src/background_process.rs       |   2 +-
 control_plane/src/bin/neon_local.rs           |  86 +++-
 control_plane/src/local_env.rs                |  37 ++
 control_plane/src/storage_controller.rs       | 396 ++++++++++++------
 storage_controller/src/http.rs                |  16 +
 storage_controller/src/peer_client.rs         |   4 +-
 storage_controller/src/service.rs             | 114 ++---
 test_runner/conftest.py                       |   1 +
 test_runner/fixtures/neon_fixtures.py         | 232 +++++++---
 .../fixtures/storage_controller_proxy.py      |  73 ++++
 test_runner/fixtures/utils.py                 |   2 +-
 .../regress/test_storage_controller.py        | 129 ++++++
 12 files changed, 841 insertions(+), 251 deletions(-)
 create mode 100644 test_runner/fixtures/storage_controller_proxy.py

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index bf8a27e550..619c5bce3e 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -379,7 +379,7 @@ where
     }
 }
 
-fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
+pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
     match kill(pid, None) {
         // Process exists, keep waiting
         Ok(_) => Ok(false),
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 51e9a51a57..edd88dc71c 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -15,7 +15,9 @@ use control_plane::local_env::{
 };
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
-use control_plane::storage_controller::StorageController;
+use control_plane::storage_controller::{
+    NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
+};
 use control_plane::{broker, local_env};
 use pageserver_api::config::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
@@ -1052,6 +1054,36 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
     humantime_duration.as_ref()
 }
 
+fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
+    let maybe_instance_id = args.get_one::<u8>("instance-id");
+
+    let base_port = args.get_one::<u16>("base-port");
+
+    if maybe_instance_id.is_some() && base_port.is_none() {
+        panic!("storage-controller start specificied instance-id but did not provide base-port");
+    }
+
+    let start_timeout = args
+        .get_one::<humantime::Duration>("start-timeout")
+        .expect("invalid value for start-timeout");
+
+    NeonStorageControllerStartArgs {
+        instance_id: maybe_instance_id.copied().unwrap_or(1),
+        base_port: base_port.copied(),
+        start_timeout: *start_timeout,
+    }
+}
+
+fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
+    let maybe_instance_id = args.get_one::<u8>("instance-id");
+    let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
+
+    NeonStorageControllerStopArgs {
+        instance_id: maybe_instance_id.copied().unwrap_or(1),
+        immediate,
+    }
+}
+
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     match sub_match.subcommand() {
         Some(("start", subcommand_args)) => {
@@ -1113,19 +1145,14 @@ async fn handle_storage_controller(
     let svc = StorageController::from_env(env);
     match sub_match.subcommand() {
         Some(("start", start_match)) => {
-            if let Err(e) = svc.start(get_start_timeout(start_match)).await {
+            if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
                 eprintln!("start failed: {e}");
                 exit(1);
             }
         }
 
         Some(("stop", stop_match)) => {
-            let immediate = stop_match
-                .get_one::<String>("stop-mode")
-                .map(|s| s.as_str())
-                == Some("immediate");
-
-            if let Err(e) = svc.stop(immediate).await {
+            if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
                 eprintln!("stop failed: {}", e);
                 exit(1);
             }
@@ -1228,7 +1255,12 @@ async fn handle_start_all(
     // Only start the storage controller if the pageserver is configured to need it
     if env.control_plane_api.is_some() {
         let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller.start(retry_timeout).await {
+        if let Err(e) = storage_controller
+            .start(NeonStorageControllerStartArgs::with_default_instance_id(
+                (*retry_timeout).into(),
+            ))
+            .await
+        {
             eprintln!("storage_controller start failed: {:#}", e);
             try_stop_all(env, true).await;
             exit(1);
@@ -1358,10 +1390,21 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
         eprintln!("neon broker stop failed: {e:#}");
     }
 
-    if env.control_plane_api.is_some() {
+    // Stop all storage controller instances. In the most common case there's only one,
+    // but iterate though the base data directory in order to discover the instances.
+    let storcon_instances = env
+        .storage_controller_instances()
+        .await
+        .expect("Must inspect data dir");
+    for (instance_id, _instance_dir_path) in storcon_instances {
         let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller.stop(immediate).await {
-            eprintln!("storage controller stop failed: {e:#}");
+        let stop_args = NeonStorageControllerStopArgs {
+            instance_id,
+            immediate,
+        };
+
+        if let Err(e) = storage_controller.stop(stop_args).await {
+            eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
         }
     }
 }
@@ -1501,6 +1544,18 @@ fn cli() -> Command {
         .action(ArgAction::SetTrue)
         .required(false);
 
+    let instance_id = Arg::new("instance-id")
+        .long("instance-id")
+        .help("Identifier used to distinguish storage controller instances (default 1)")
+        .value_parser(value_parser!(u8))
+        .required(false);
+
+    let base_port = Arg::new("base-port")
+        .long("base-port")
+        .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
+        .value_parser(value_parser!(u16))
+        .required(false);
+
     Command::new("Neon CLI")
         .arg_required_else_help(true)
         .version(GIT_VERSION)
@@ -1609,9 +1664,12 @@ fn cli() -> Command {
                 .arg_required_else_help(true)
                 .about("Manage storage_controller")
                 .subcommand(Command::new("start").about("Start storage controller")
-                            .arg(timeout_arg.clone()))
+                            .arg(timeout_arg.clone())
+                            .arg(instance_id.clone())
+                            .arg(base_port))
                 .subcommand(Command::new("stop").about("Stop storage controller")
-                            .arg(stop_mode_arg.clone()))
+                            .arg(stop_mode_arg.clone())
+                            .arg(instance_id))
         )
         .subcommand(
             Command::new("safekeeper")
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 15bbac702f..807519c88d 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -156,6 +156,11 @@ pub struct NeonStorageControllerConf {
     #[serde(with = "humantime_serde")]
     pub max_warming_up: Duration,
 
+    pub start_as_candidate: bool,
+
+    /// Database url used when running multiple storage controller instances
+    pub database_url: Option<SocketAddr>,
+
     /// Threshold for auto-splitting a tenant into shards
     pub split_threshold: Option<u64>,
 
@@ -174,6 +179,8 @@ impl Default for NeonStorageControllerConf {
         Self {
             max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
             max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
+            start_as_candidate: false,
+            database_url: None,
             split_threshold: None,
             max_secondary_lag_bytes: None,
         }
@@ -392,6 +399,36 @@ impl LocalEnv {
         }
     }
 
+    /// Inspect the base data directory and extract the instance id and instance directory path
+    /// for all storage controller instances
+    pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
+        let mut instances = Vec::default();
+
+        let dir = std::fs::read_dir(self.base_data_dir.clone())?;
+        for dentry in dir {
+            let dentry = dentry?;
+            let is_dir = dentry.metadata()?.is_dir();
+            let filename = dentry.file_name().into_string().unwrap();
+            let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
+                Some(suffix) => suffix.parse::<u8>().ok(),
+                None => None,
+            };
+
+            let is_instance_dir = is_dir && parsed_instance_id.is_some();
+
+            if !is_instance_dir {
+                continue;
+            }
+
+            instances.push((
+                parsed_instance_id.expect("Checked previously"),
+                dentry.path(),
+            ));
+        }
+
+        Ok(instances)
+    }
+
     pub fn register_branch_mapping(
         &mut self,
         branch_name: String,
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index f180e922e8..2c077595a1 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -3,6 +3,8 @@ use crate::{
     local_env::{LocalEnv, NeonStorageControllerConf},
 };
 use camino::{Utf8Path, Utf8PathBuf};
+use hyper::Uri;
+use nix::unistd::Pid;
 use pageserver_api::{
     controller_api::{
         NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
@@ -18,7 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, str::FromStr, time::Duration};
+use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -29,12 +31,14 @@ use utils::{
 
 pub struct StorageController {
     env: LocalEnv,
-    listen: String,
     private_key: Option<Vec<u8>>,
     public_key: Option<String>,
-    postgres_port: u16,
     client: reqwest::Client,
     config: NeonStorageControllerConf,
+
+    // The listen addresses is learned when starting the storage controller,
+    // hence the use of OnceLock to init it at the right time.
+    listen: OnceLock<SocketAddr>,
 }
 
 const COMMAND: &str = "storage_controller";
@@ -43,6 +47,36 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 
 const DB_NAME: &str = "storage_controller";
 
+pub struct NeonStorageControllerStartArgs {
+    pub instance_id: u8,
+    pub base_port: Option<u16>,
+    pub start_timeout: humantime::Duration,
+}
+
+impl NeonStorageControllerStartArgs {
+    pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
+        Self {
+            instance_id: 1,
+            base_port: None,
+            start_timeout,
+        }
+    }
+}
+
+pub struct NeonStorageControllerStopArgs {
+    pub instance_id: u8,
+    pub immediate: bool,
+}
+
+impl NeonStorageControllerStopArgs {
+    pub fn with_default_instance_id(immediate: bool) -> Self {
+        Self {
+            instance_id: 1,
+            immediate,
+        }
+    }
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
@@ -67,23 +101,6 @@ pub struct InspectResponse {
 
 impl StorageController {
     pub fn from_env(env: &LocalEnv) -> Self {
-        // Makes no sense to construct this if pageservers aren't going to use it: assume
-        // pageservers have control plane API set
-        let listen_url = env.control_plane_api.clone().unwrap();
-
-        let listen = format!(
-            "{}:{}",
-            listen_url.host_str().unwrap(),
-            listen_url.port().unwrap()
-        );
-
-        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
-        // port, for use by our captive postgres.
-        let postgres_port = listen_url
-            .port()
-            .expect("Control plane API setting should always have a port")
-            + 1;
-
         // Assume all pageservers have symmetric auth configuration: this service
         // expects to use one JWT token to talk to all of them.
         let ps_conf = env
@@ -126,20 +143,28 @@ impl StorageController {
 
         Self {
             env: env.clone(),
-            listen,
             private_key,
             public_key,
-            postgres_port,
             client: reqwest::ClientBuilder::new()
                 .build()
                 .expect("Failed to construct http client"),
             config: env.storage_controller.clone(),
+            listen: OnceLock::default(),
         }
     }
 
-    fn pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
-            .expect("non-Unicode path")
+    fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
+        self.env
+            .base_data_dir
+            .join(format!("storage_controller_{}", instance_id))
+    }
+
+    fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(
+            self.storage_controller_instance_dir(instance_id)
+                .join("storage_controller.pid"),
+        )
+        .expect("non-Unicode path")
     }
 
     /// PIDFile for the postgres instance used to store storage controller state
@@ -184,9 +209,9 @@ impl StorageController {
     }
 
     /// Readiness check for our postgres process
-    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
+    async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
         let bin_path = pg_bin_dir.join("pg_isready");
-        let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
+        let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
         let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
 
         Ok(exitcode.success())
@@ -199,8 +224,8 @@ impl StorageController {
     /// who just want to run `cargo neon_local` without knowing about diesel.
     ///
     /// Returns the database url
-    pub async fn setup_database(&self) -> anyhow::Result<String> {
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
+    pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
+        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
 
         let pg_bin_dir = self.get_pg_bin_dir().await?;
         let createdb_path = pg_bin_dir.join("createdb");
@@ -209,7 +234,7 @@ impl StorageController {
                 "-h",
                 "localhost",
                 "-p",
-                &format!("{}", self.postgres_port),
+                &format!("{}", postgres_port),
                 DB_NAME,
             ])
             .output()
@@ -230,13 +255,14 @@ impl StorageController {
 
     pub async fn connect_to_database(
         &self,
+        postgres_port: u16,
     ) -> anyhow::Result<(
         tokio_postgres::Client,
         tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
     )> {
         tokio_postgres::Config::new()
             .host("localhost")
-            .port(self.postgres_port)
+            .port(postgres_port)
             // The user is the ambient operating system user name.
             // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
             //
@@ -252,72 +278,115 @@ impl StorageController {
             .map_err(anyhow::Error::new)
     }
 
-    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
-        // Start a vanilla Postgres process used by the storage controller for persistence.
-        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
-            .unwrap()
-            .join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;
-        let pg_lib_dir = self.get_pg_lib_dir().await?;
-        let pg_log_path = pg_data_path.join("postgres.log");
+    pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
+        let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
+        if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
+            if err.kind() != std::io::ErrorKind::AlreadyExists {
+                panic!("Failed to create instance dir {instance_dir:?}");
+            }
+        }
 
-        if !tokio::fs::try_exists(&pg_data_path).await? {
-            // Initialize empty database
-            let initdb_path = pg_bin_dir.join("initdb");
-            let mut child = Command::new(&initdb_path)
-                .envs(vec![
-                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ])
-                .args(["-D", pg_data_path.as_ref()])
-                .spawn()
-                .expect("Failed to spawn initdb");
-            let status = child.wait().await?;
-            if !status.success() {
-                anyhow::bail!("initdb failed with status {status}");
+        let (listen, postgres_port) = {
+            if let Some(base_port) = start_args.base_port {
+                (
+                    format!("127.0.0.1:{base_port}"),
+                    self.config
+                        .database_url
+                        .expect("--base-port requires NeonStorageControllerConf::database_url")
+                        .port(),
+                )
+            } else {
+                let listen_url = self.env.control_plane_api.clone().unwrap();
+
+                let listen = format!(
+                    "{}:{}",
+                    listen_url.host_str().unwrap(),
+                    listen_url.port().unwrap()
+                );
+
+                (listen, listen_url.port().unwrap() + 1)
             }
         };
 
-        // Write a minimal config file:
-        // - Specify the port, since this is chosen dynamically
-        // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-        //   the storage controller we don't want a slow local disk to interfere with that.
-        //
-        // NB: it's important that we rewrite this file on each start command so we propagate changes
-        // from `LocalEnv`'s config file (`.neon/config`).
-        tokio::fs::write(
-            &pg_data_path.join("postgresql.conf"),
-            format!("port = {}\nfsync=off\n", self.postgres_port),
-        )
-        .await?;
+        let socket_addr = listen
+            .parse()
+            .expect("listen address is a valid socket address");
+        self.listen
+            .set(socket_addr)
+            .expect("StorageController::listen is only set here");
 
-        println!("Starting storage controller database...");
-        let db_start_args = [
-            "-w",
-            "-D",
-            pg_data_path.as_ref(),
-            "-l",
-            pg_log_path.as_ref(),
-            "start",
-        ];
+        // Do we remove the pid file on stop?
+        let pg_started = self.is_postgres_running().await?;
+        let pg_lib_dir = self.get_pg_lib_dir().await?;
 
-        background_process::start_process(
-            "storage_controller_db",
-            &self.env.base_data_dir,
-            pg_bin_dir.join("pg_ctl").as_std_path(),
-            db_start_args,
-            vec![
-                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-            ],
-            background_process::InitialPidFile::Create(self.postgres_pid_file()),
-            retry_timeout,
-            || self.pg_isready(&pg_bin_dir),
-        )
-        .await?;
+        if !pg_started {
+            // Start a vanilla Postgres process used by the storage controller for persistence.
+            let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
+                .unwrap()
+                .join("storage_controller_db");
+            let pg_bin_dir = self.get_pg_bin_dir().await?;
+            let pg_log_path = pg_data_path.join("postgres.log");
 
-        // Run migrations on every startup, in case something changed.
-        let database_url = self.setup_database().await?;
+            if !tokio::fs::try_exists(&pg_data_path).await? {
+                // Initialize empty database
+                let initdb_path = pg_bin_dir.join("initdb");
+                let mut child = Command::new(&initdb_path)
+                    .envs(vec![
+                        ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                        ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                    ])
+                    .args(["-D", pg_data_path.as_ref()])
+                    .spawn()
+                    .expect("Failed to spawn initdb");
+                let status = child.wait().await?;
+                if !status.success() {
+                    anyhow::bail!("initdb failed with status {status}");
+                }
+            };
+
+            // Write a minimal config file:
+            // - Specify the port, since this is chosen dynamically
+            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+            //   the storage controller we don't want a slow local disk to interfere with that.
+            //
+            // NB: it's important that we rewrite this file on each start command so we propagate changes
+            // from `LocalEnv`'s config file (`.neon/config`).
+            tokio::fs::write(
+                &pg_data_path.join("postgresql.conf"),
+                format!("port = {}\nfsync=off\n", postgres_port),
+            )
+            .await?;
+
+            println!("Starting storage controller database...");
+            let db_start_args = [
+                "-w",
+                "-D",
+                pg_data_path.as_ref(),
+                "-l",
+                pg_log_path.as_ref(),
+                "start",
+            ];
+
+            background_process::start_process(
+                "storage_controller_db",
+                &self.env.base_data_dir,
+                pg_bin_dir.join("pg_ctl").as_std_path(),
+                db_start_args,
+                vec![
+                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                ],
+                background_process::InitialPidFile::Create(self.postgres_pid_file()),
+                &start_args.start_timeout,
+                || self.pg_isready(&pg_bin_dir, postgres_port),
+            )
+            .await?;
+
+            // Run migrations on every startup, in case something changed.
+            self.setup_database(postgres_port).await?;
+        }
+
+        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
 
         // We support running a startup SQL script to fiddle with the database before we launch storcon.
         // This is used by the test suite.
@@ -339,7 +408,7 @@ impl StorageController {
                 }
             }
         };
-        let (mut client, conn) = self.connect_to_database().await?;
+        let (mut client, conn) = self.connect_to_database(postgres_port).await?;
         let conn = tokio::spawn(conn);
         let tx = client.build_transaction();
         let tx = tx.start().await?;
@@ -348,9 +417,20 @@ impl StorageController {
         drop(client);
         conn.await??;
 
+        let listen = self
+            .listen
+            .get()
+            .expect("cell is set earlier in this function");
+        let address_for_peers = Uri::builder()
+            .scheme("http")
+            .authority(format!("{}:{}", listen.ip(), listen.port()))
+            .path_and_query("")
+            .build()
+            .unwrap();
+
         let mut args = vec![
             "-l",
-            &self.listen,
+            &listen.to_string(),
             "--dev",
             "--database-url",
             &database_url,
@@ -358,10 +438,17 @@ impl StorageController {
             &humantime::Duration::from(self.config.max_offline).to_string(),
             "--max-warming-up-interval",
             &humantime::Duration::from(self.config.max_warming_up).to_string(),
+            "--address-for-peers",
+            &address_for_peers.to_string(),
         ]
         .into_iter()
         .map(|s| s.to_string())
         .collect::<Vec<_>>();
+
+        if self.config.start_as_candidate {
+            args.push("--start-as-candidate".to_string());
+        }
+
         if let Some(private_key) = &self.private_key {
             let claims = Claims::new(None, Scope::PageServerApi);
             let jwt_token =
@@ -394,15 +481,15 @@ impl StorageController {
 
         background_process::start_process(
             COMMAND,
-            &self.env.base_data_dir,
+            &instance_dir,
             &self.env.storage_controller_bin(),
             args,
             vec![
                 ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                 ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
             ],
-            background_process::InitialPidFile::Create(self.pid_file()),
-            retry_timeout,
+            background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
+            &start_args.start_timeout,
             || async {
                 match self.ready().await {
                     Ok(_) => Ok(true),
@@ -415,8 +502,35 @@ impl StorageController {
         Ok(())
     }
 
-    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
+    pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
+        background_process::stop_process(
+            stop_args.immediate,
+            COMMAND,
+            &self.pid_file(stop_args.instance_id),
+        )?;
+
+        let storcon_instances = self.env.storage_controller_instances().await?;
+        for (instance_id, instanced_dir_path) in storcon_instances {
+            if instance_id == stop_args.instance_id {
+                continue;
+            }
+
+            let pid_file = instanced_dir_path.join("storage_controller.pid");
+            let pid = tokio::fs::read_to_string(&pid_file)
+                .await
+                .map_err(|err| {
+                    anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
+                })?
+                .parse::<i32>()
+                .expect("pid is valid i32");
+
+            let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
+            if other_proc_alive {
+                // There is another storage controller instance running, so we return
+                // and leave the database running.
+                return Ok(());
+            }
+        }
 
         let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
         let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -429,27 +543,51 @@ impl StorageController {
             .wait()
             .await?;
         if !stop_status.success() {
-            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
-                .args(pg_status_args)
-                .spawn()?
-                .wait()
-                .await?;
-
-            // pg_ctl status returns this exit code if postgres is not running: in this case it is
-            // fine that stop failed.  Otherwise it is an error that stop failed.
-            const PG_STATUS_NOT_RUNNING: i32 = 3;
-            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
-                println!("Storage controller database is already stopped");
-                return Ok(());
-            } else {
-                anyhow::bail!("Failed to stop storage controller database: {stop_status}")
+            match self.is_postgres_running().await {
+                Ok(false) => {
+                    println!("Storage controller database is already stopped");
+                    return Ok(());
+                }
+                Ok(true) => {
+                    anyhow::bail!("Failed to stop storage controller database");
+                }
+                Err(err) => {
+                    anyhow::bail!("Failed to stop storage controller database: {err}");
+                }
             }
         }
 
         Ok(())
     }
 
+    async fn is_postgres_running(&self) -> anyhow::Result<bool> {
+        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+
+        let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
+        let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+            .args(pg_status_args)
+            .spawn()?
+            .wait()
+            .await?;
+
+        // pg_ctl status returns this exit code if postgres is not running: in this case it is
+        // fine that stop failed.  Otherwise it is an error that stop failed.
+        const PG_STATUS_NOT_RUNNING: i32 = 3;
+        const PG_NO_DATA_DIR: i32 = 4;
+        const PG_STATUS_RUNNING: i32 = 0;
+        match status_exitcode.code() {
+            Some(PG_STATUS_NOT_RUNNING) => Ok(false),
+            Some(PG_NO_DATA_DIR) => Ok(false),
+            Some(PG_STATUS_RUNNING) => Ok(true),
+            Some(code) => Err(anyhow::anyhow!(
+                "pg_ctl status returned unexpected status code: {:?}",
+                code
+            )),
+            None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
+        }
+    }
+
     fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
         let category = match path.find('/') {
             Some(idx) => &path[..idx],
@@ -475,15 +613,31 @@ impl StorageController {
         RQ: Serialize + Sized,
         RS: DeserializeOwned + Sized,
     {
-        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
-        // for general purpose API access.
-        let listen_url = self.env.control_plane_api.clone().unwrap();
-        let url = Url::from_str(&format!(
-            "http://{}:{}/{path}",
-            listen_url.host_str().unwrap(),
-            listen_url.port().unwrap()
-        ))
-        .unwrap();
+        // In the special case of the `storage_controller start` subcommand, we wish
+        // to use the API endpoint of the newly started storage controller in order
+        // to pass the readiness check. In this scenario [`Self::listen`] will be set
+        // (see [`Self::start`]).
+        //
+        // Otherwise, we infer the storage controller api endpoint from the configured
+        // control plane API.
+        let url = if let Some(socket_addr) = self.listen.get() {
+            Url::from_str(&format!(
+                "http://{}:{}/{path}",
+                socket_addr.ip().to_canonical(),
+                socket_addr.port()
+            ))
+            .unwrap()
+        } else {
+            // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+            // for general purpose API access.
+            let listen_url = self.env.control_plane_api.clone().unwrap();
+            Url::from_str(&format!(
+                "http://{}:{}/{path}",
+                listen_url.host_str().unwrap(),
+                listen_url.port().unwrap()
+            ))
+            .unwrap()
+        };
 
         let mut builder = self.client.request(method, url);
         if let Some(body) = body {
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index e755aaed19..7bbd1541cf 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -520,6 +520,19 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
     json_response(StatusCode::OK, node_status)
 }
 
+async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let leader = state.service.get_leader().await.map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!(
+            "Failed to read leader from database: {err}"
+        ))
+    })?;
+
+    json_response(StatusCode::OK, leader)
+}
+
 async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1016,6 +1029,9 @@ pub fn make_router(
         .get("/control/v1/node/:node_id", |r| {
             named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
         })
+        .get("/control/v1/leader", |r| {
+            named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
+        })
         .put("/control/v1/node/:node_id/drain", |r| {
             named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
         })
diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs
index ebb59a1720..3f8520fe55 100644
--- a/storage_controller/src/peer_client.rs
+++ b/storage_controller/src/peer_client.rs
@@ -1,7 +1,7 @@
 use crate::tenant_shard::ObservedState;
 use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 use tokio_util::sync::CancellationToken;
 
 use hyper::Uri;
@@ -69,6 +69,8 @@ impl PeerClient {
             req
         };
 
+        let req = req.timeout(Duration::from_secs(2));
+
         let res = req
             .send()
             .await
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 84db088a42..3459b44774 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -20,7 +20,8 @@ use crate::{
     metrics,
     peer_client::{GlobalObservedState, PeerClient},
     persistence::{
-        AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter,
+        AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
+        TenantFilter,
     },
     reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
     scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
@@ -489,11 +490,6 @@ pub(crate) enum ReconcileResultRequest {
     Stop,
 }
 
-struct LeaderStepDownState {
-    observed: GlobalObservedState,
-    leader: ControllerPersistence,
-}
-
 impl Service {
     pub fn get_config(&self) -> &Config {
         &self.config
@@ -504,7 +500,8 @@ impl Service {
     #[instrument(skip_all)]
     async fn startup_reconcile(
         self: &Arc<Service>,
-        leader_step_down_state: Option<LeaderStepDownState>,
+        current_leader: Option<ControllerPersistence>,
+        leader_step_down_state: Option<GlobalObservedState>,
         bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
             Result<(), (TenantShardId, NotifyError)>,
         >,
@@ -522,17 +519,15 @@ impl Service {
             .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
             .expect("Reconcile timeout is a modest constant");
 
-        let (observed, current_leader) = if let Some(state) = leader_step_down_state {
+        let observed = if let Some(state) = leader_step_down_state {
             tracing::info!(
                 "Using observed state received from leader at {}",
-                state.leader.address,
+                current_leader.as_ref().unwrap().address
             );
-            (state.observed, Some(state.leader))
+
+            state
         } else {
-            (
-                self.build_global_observed_state(node_scan_deadline).await,
-                None,
-            )
+            self.build_global_observed_state(node_scan_deadline).await
         };
 
         // Accumulate a list of any tenant locations that ought to be detached
@@ -1382,13 +1377,32 @@ impl Service {
                 };
 
                 let leadership_status = this.inner.read().unwrap().get_leadership_status();
-                let peer_observed_state = match leadership_status {
-                    LeadershipStatus::Candidate => this.request_step_down().await,
+                let leader = match this.get_leader().await {
+                    Ok(ok) => ok,
+                    Err(err) => {
+                        tracing::error!(
+                            "Failed to query database for current leader: {err}. Aborting start-up ..."
+                        );
+                        std::process::exit(1);
+                    }
+                };
+
+                let leader_step_down_state = match leadership_status {
+                    LeadershipStatus::Candidate => {
+                        if let Some(ref leader) = leader {
+                            this.request_step_down(leader).await
+                        } else {
+                            tracing::info!(
+                                "No leader found to request step down from. Will build observed state."
+                            );
+                            None
+                        }
+                    }
                     LeadershipStatus::Leader => None,
                     LeadershipStatus::SteppedDown => unreachable!(),
                 };
 
-                this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
+                this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
                     .await;
 
                 drop(startup_completion);
@@ -4650,6 +4664,10 @@ impl Service {
             ))
     }
 
+    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
+        self.persistence.get_leader().await
+    }
+
     pub(crate) async fn node_register(
         &self,
         register_req: NodeRegisterRequest,
@@ -6342,6 +6360,7 @@ impl Service {
 
     pub(crate) async fn step_down(&self) -> GlobalObservedState {
         tracing::info!("Received step down request from peer");
+        failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
 
         self.inner.write().unwrap().step_down();
         // TODO: would it make sense to have a time-out for this?
@@ -6367,50 +6386,31 @@ impl Service {
     ///
     /// On failures to query the database or step down error responses the process is killed
     /// and we rely on k8s to retry.
-    async fn request_step_down(&self) -> Option<LeaderStepDownState> {
-        let leader = match self.persistence.get_leader().await {
-            Ok(leader) => leader,
+    async fn request_step_down(
+        &self,
+        leader: &ControllerPersistence,
+    ) -> Option<GlobalObservedState> {
+        tracing::info!("Sending step down request to {leader:?}");
+
+        // TODO: jwt token
+        let client = PeerClient::new(
+            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
+            self.config.jwt_token.clone(),
+        );
+        let state = client.step_down(&self.cancel).await;
+        match state {
+            Ok(state) => Some(state),
             Err(err) => {
+                // TODO: Make leaders periodically update a timestamp field in the
+                // database and, if the leader is not reachable from the current instance,
+                // but inferred as alive from the timestamp, abort start-up. This avoids
+                // a potential scenario in which we have two controllers acting as leaders.
                 tracing::error!(
-                    "Failed to query database for current leader: {err}. Aborting start-up ..."
+                    "Leader ({}) did not respond to step-down request: {}",
+                    leader.address,
+                    err
                 );
-                std::process::exit(1);
-            }
-        };
 
-        match leader {
-            Some(leader) => {
-                tracing::info!("Sending step down request to {leader:?}");
-
-                // TODO: jwt token
-                let client = PeerClient::new(
-                    Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-                    self.config.jwt_token.clone(),
-                );
-                let state = client.step_down(&self.cancel).await;
-                match state {
-                    Ok(state) => Some(LeaderStepDownState {
-                        observed: state,
-                        leader: leader.clone(),
-                    }),
-                    Err(err) => {
-                        // TODO: Make leaders periodically update a timestamp field in the
-                        // database and, if the leader is not reachable from the current instance,
-                        // but inferred as alive from the timestamp, abort start-up. This avoids
-                        // a potential scenario in which we have two controllers acting as leaders.
-                        tracing::error!(
-                            "Leader ({}) did not respond to step-down request: {}",
-                            leader.address,
-                            err
-                        );
-                        None
-                    }
-                }
-            }
-            None => {
-                tracing::info!(
-                    "No leader found to request step down from. Will build observed state."
-                );
                 None
             }
         }
diff --git a/test_runner/conftest.py b/test_runner/conftest.py
index 4b0c9ac71d..996ca4d652 100644
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -3,6 +3,7 @@ pytest_plugins = (
     "fixtures.parametrize",
     "fixtures.httpserver",
     "fixtures.compute_reconfigure",
+    "fixtures.storage_controller_proxy",
     "fixtures.neon_fixtures",
     "fixtures.benchmark_fixture",
     "fixtures.pg_stats",
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b76432127d..ec5a83601e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -497,6 +497,7 @@ class NeonEnvBuilder:
         pageserver_aux_file_policy: Optional[AuxFileStore] = None,
         pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
         safekeeper_extra_opts: Optional[list[str]] = None,
+        storage_controller_port_override: Optional[int] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -549,6 +550,8 @@ class NeonEnvBuilder:
 
         self.safekeeper_extra_opts = safekeeper_extra_opts
 
+        self.storage_controller_port_override = storage_controller_port_override
+
         assert test_name.startswith(
             "test_"
         ), "Unexpectedly instantiated from outside a test function"
@@ -1054,6 +1057,7 @@ class NeonEnv:
     """
 
     BASE_PAGESERVER_ID = 1
+    storage_controller: NeonStorageController | NeonProxiedStorageController
 
     def __init__(self, config: NeonEnvBuilder):
         self.repo_dir = config.repo_dir
@@ -1084,27 +1088,41 @@ class NeonEnv:
         self.initial_tenant = config.initial_tenant
         self.initial_timeline = config.initial_timeline
 
-        # Find two adjacent ports for storage controller and its postgres DB.  This
-        # loop would eventually throw from get_port() if we run out of ports (extremely
-        # unlikely): usually we find two adjacent free ports on the first iteration.
-        while True:
-            self.storage_controller_port = self.port_distributor.get_port()
-            storage_controller_pg_port = self.port_distributor.get_port()
-            if storage_controller_pg_port == self.storage_controller_port + 1:
-                break
-
         # The URL for the pageserver to use as its control_plane_api config
-        self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1"
-        # The base URL of the storage controller
-        self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}"
+        if config.storage_controller_port_override is not None:
+            log.info(
+                f"Using storage controller api override {config.storage_controller_port_override}"
+            )
+
+            self.storage_controller_port = config.storage_controller_port_override
+            self.storage_controller = NeonProxiedStorageController(
+                self, config.storage_controller_port_override, config.auth_enabled
+            )
+        else:
+            # Find two adjacent ports for storage controller and its postgres DB.  This
+            # loop would eventually throw from get_port() if we run out of ports (extremely
+            # unlikely): usually we find two adjacent free ports on the first iteration.
+            while True:
+                storage_controller_port = self.port_distributor.get_port()
+                storage_controller_pg_port = self.port_distributor.get_port()
+                if storage_controller_pg_port == storage_controller_port + 1:
+                    break
+
+            self.storage_controller_port = storage_controller_port
+            self.storage_controller = NeonStorageController(
+                self, storage_controller_port, config.auth_enabled
+            )
+
+            log.info(
+                f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}"
+            )
+
+        self.storage_controller_api: str = self.storage_controller.api_root()
+        self.control_plane_api: str = self.storage_controller.upcall_api_endpoint()
 
         # For testing this with a fake HTTP server, enable passing through a URL from config
         self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
 
-        self.storage_controller: NeonStorageController = NeonStorageController(
-            self, config.auth_enabled
-        )
-
         self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
         self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
 
@@ -1869,16 +1887,24 @@ class NeonCli(AbstractNeonCli):
     def storage_controller_start(
         self,
         timeout_in_seconds: Optional[int] = None,
+        instance_id: Optional[int] = None,
+        base_port: Optional[int] = None,
     ):
         cmd = ["storage_controller", "start"]
         if timeout_in_seconds is not None:
             cmd.append(f"--start-timeout={timeout_in_seconds}s")
+        if instance_id is not None:
+            cmd.append(f"--instance-id={instance_id}")
+        if base_port is not None:
+            cmd.append(f"--base-port={base_port}")
         return self.raw_cli(cmd)
 
-    def storage_controller_stop(self, immediate: bool):
+    def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None):
         cmd = ["storage_controller", "stop"]
         if immediate:
             cmd.extend(["-m", "immediate"])
+        if instance_id is not None:
+            cmd.append(f"--instance-id={instance_id}")
         return self.raw_cli(cmd)
 
     def pageserver_start(
@@ -2189,17 +2215,30 @@ class PageserverSchedulingPolicy(str, Enum):
     PAUSE_FOR_RESTART = "PauseForRestart"
 
 
+class StorageControllerLeadershipStatus(str, Enum):
+    LEADER = "leader"
+    STEPPED_DOWN = "stepped_down"
+    CANDIDATE = "candidate"
+
+
 class NeonStorageController(MetricsGetter, LogUtils):
-    def __init__(self, env: NeonEnv, auth_enabled: bool):
+    def __init__(self, env: NeonEnv, port: int, auth_enabled: bool):
         self.env = env
+        self.port: int = port
+        self.api: str = f"http://127.0.0.1:{port}"
         self.running = False
         self.auth_enabled = auth_enabled
         self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
-        self.logfile = self.workdir / "storage_controller.log"
+        self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log"
 
-    def start(self, timeout_in_seconds: Optional[int] = None):
+    def start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+        instance_id: Optional[int] = None,
+        base_port: Optional[int] = None,
+    ):
         assert not self.running
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds)
+        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
         self.running = True
         return self
 
@@ -2209,6 +2248,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
             self.running = False
         return self
 
+    def upcall_api_endpoint(self) -> str:
+        return f"{self.api}/upcall/v1"
+
+    def api_root(self) -> str:
+        return self.api
+
     @staticmethod
     def retryable_node_operation(op, ps_id, max_attempts, backoff):
         while max_attempts > 0:
@@ -2237,7 +2282,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
     def assert_no_errors(self):
         assert_no_errors(
-            self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
+            self.logfile,
+            "storage_controller",
+            self.allowed_errors,
         )
 
     def pageserver_api(self) -> PageserverHttpClient:
@@ -2249,7 +2296,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         auth_token = None
         if self.auth_enabled:
             auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token)
+        return PageserverHttpClient(self.port, lambda: True, auth_token)
 
     def request(self, method, *args, **kwargs) -> requests.Response:
         resp = requests.request(method, *args, **kwargs)
@@ -2266,13 +2313,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
         return headers
 
     def get_metrics(self) -> Metrics:
-        res = self.request("GET", f"{self.env.storage_controller_api}/metrics")
+        res = self.request("GET", f"{self.api}/metrics")
         return parse_metrics(res.text)
 
     def ready(self) -> bool:
         status = None
         try:
-            resp = self.request("GET", f"{self.env.storage_controller_api}/ready")
+            resp = self.request("GET", f"{self.api}/ready")
             status = resp.status_code
         except StorageControllerApiException as e:
             status = e.status_code
@@ -2305,7 +2352,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         response = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
+            f"{self.api}/debug/v1/attach-hook",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2316,7 +2363,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
+            f"{self.api}/debug/v1/attach-hook",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2327,7 +2374,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         """
         response = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/inspect",
+            f"{self.api}/debug/v1/inspect",
             json={"tenant_shard_id": str(tenant_shard_id)},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2350,7 +2397,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"node_register({body})")
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/control/v1/node",
+            f"{self.api}/control/v1/node",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2359,7 +2406,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"node_delete({node_id})")
         self.request(
             "DELETE",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
+            f"{self.api}/control/v1/node/{node_id}",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2367,7 +2414,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"node_drain({node_id})")
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
+            f"{self.api}/control/v1/node/{node_id}/drain",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2375,7 +2422,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"cancel_node_drain({node_id})")
         self.request(
             "DELETE",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
+            f"{self.api}/control/v1/node/{node_id}/drain",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2383,7 +2430,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"node_fill({node_id})")
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
+            f"{self.api}/control/v1/node/{node_id}/fill",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2391,14 +2438,22 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"cancel_node_fill({node_id})")
         self.request(
             "DELETE",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
+            f"{self.api}/control/v1/node/{node_id}/fill",
             headers=self.headers(TokenScope.ADMIN),
         )
 
     def node_status(self, node_id):
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
+            f"{self.api}/control/v1/node/{node_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def get_leader(self):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/leader",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2406,7 +2461,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def node_list(self):
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/node",
+            f"{self.api}/control/v1/node",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2414,7 +2469,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def tenant_list(self):
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/debug/v1/tenant",
+            f"{self.api}/debug/v1/tenant",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2424,7 +2479,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         body["node_id"] = node_id
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config",
+            f"{self.api}/control/v1/node/{node_id}/config",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2459,7 +2514,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         response = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/v1/tenant",
+            f"{self.api}/v1/tenant",
             json=body,
             headers=self.headers(TokenScope.PAGE_SERVER_API),
         )
@@ -2472,7 +2527,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         """
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
+            f"{self.api}/debug/v1/tenant/{tenant_id}/locate",
             headers=self.headers(TokenScope.ADMIN),
         )
         body = response.json()
@@ -2485,7 +2540,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         """
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
+            f"{self.api}/control/v1/tenant/{tenant_id}",
             headers=self.headers(TokenScope.ADMIN),
         )
         response.raise_for_status()
@@ -2496,7 +2551,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     ) -> list[TenantShardId]:
         response = self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
+            f"{self.api}/control/v1/tenant/{tenant_id}/shard_split",
             json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2508,7 +2563,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate",
+            f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2519,7 +2574,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"tenant_policy_update({tenant_id}, {body})")
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy",
+            f"{self.api}/control/v1/tenant/{tenant_id}/policy",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2527,14 +2582,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def tenant_import(self, tenant_id: TenantId):
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import",
+            f"{self.api}/debug/v1/tenant/{tenant_id}/import",
             headers=self.headers(TokenScope.ADMIN),
         )
 
     def reconcile_all(self):
         r = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/reconcile_all",
+            f"{self.api}/debug/v1/reconcile_all",
             headers=self.headers(TokenScope.ADMIN),
         )
         r.raise_for_status()
@@ -2567,7 +2622,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         """
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/consistency_check",
+            f"{self.api}/debug/v1/consistency_check",
             headers=self.headers(TokenScope.ADMIN),
         )
         log.info("storage controller passed consistency check")
@@ -2640,7 +2695,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
+            f"{self.api}/control/v1/metadata_health/update",
             json=body,
             headers=self.headers(TokenScope.SCRUBBER),
         )
@@ -2648,7 +2703,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def metadata_health_list_unhealthy(self):
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
+            f"{self.api}/control/v1/metadata_health/unhealthy",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2658,7 +2713,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         response = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
+            f"{self.api}/control/v1/metadata_health/outdated",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2681,7 +2736,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info("Asking storage controller to step down")
         response = self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/step_down",
+            f"{self.api}/control/v1/step_down",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2698,7 +2753,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         res = self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/debug/v1/failpoints",
+            f"{self.api}/debug/v1/failpoints",
             json=[{"name": name, "actions": actions} for name, actions in pairs],
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2768,9 +2823,21 @@ class NeonStorageController(MetricsGetter, LogUtils):
                 parsed_tid, wait_ms=250
             )
 
-    @property
-    def workdir(self) -> Path:
-        return self.env.repo_dir
+    def get_leadership_status(self) -> StorageControllerLeadershipStatus:
+        metric_values = {}
+        for status in StorageControllerLeadershipStatus:
+            metric_value = self.get_metric_value(
+                "storage_controller_leadership_status", filter={"status": status}
+            )
+            metric_values[status] = metric_value
+
+        assert list(metric_values.values()).count(1) == 1
+
+        for status, metric_value in metric_values.items():
+            if metric_value == 1:
+                return status
+
+        raise AssertionError("unreachable")
 
     def __enter__(self) -> "NeonStorageController":
         return self
@@ -2784,6 +2851,59 @@ class NeonStorageController(MetricsGetter, LogUtils):
         self.stop(immediate=True)
 
 
+class NeonProxiedStorageController(NeonStorageController):
+    def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool):
+        super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled)
+        self.instances: dict[int, dict[str, Any]] = {}
+
+    def start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+        instance_id: Optional[int] = None,
+        base_port: Optional[int] = None,
+    ):
+        assert instance_id is not None and base_port is not None
+
+        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
+        self.instances[instance_id] = {"running": True}
+
+        self.running = True
+        return self
+
+    def stop_instance(
+        self, immediate: bool = False, instance_id: Optional[int] = None
+    ) -> "NeonStorageController":
+        assert instance_id in self.instances
+        if self.instances[instance_id]["running"]:
+            self.env.neon_cli.storage_controller_stop(immediate, instance_id)
+            self.instances[instance_id]["running"] = False
+
+        self.running = any(meta["running"] for meta in self.instances.values())
+        return self
+
+    def stop(self, immediate: bool = False) -> "NeonStorageController":
+        for iid, details in self.instances.items():
+            if details["running"]:
+                self.env.neon_cli.storage_controller_stop(immediate, iid)
+                self.instances[iid]["running"] = False
+
+        self.running = False
+        return self
+
+    def assert_no_errors(self):
+        for instance_id in self.instances.keys():
+            assert_no_errors(
+                self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log",
+                "storage_controller",
+                self.allowed_errors,
+            )
+
+    def log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Optional[Tuple[str, LogCursor]]:
+        raise NotImplementedError()
+
+
 @dataclass
 class LogCursor:
     _line_no: int
@@ -4520,7 +4640,7 @@ class StorageScrubber:
 
         base_args = [
             str(self.env.neon_binpath / "storage_scrubber"),
-            f"--controller-api={self.env.storage_controller_api}",
+            f"--controller-api={self.env.storage_controller.api_root()}",
         ]
         args = base_args + args
 
diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py
new file mode 100644
index 0000000000..3477f8b1f2
--- /dev/null
+++ b/test_runner/fixtures/storage_controller_proxy.py
@@ -0,0 +1,73 @@
+import re
+from typing import Any, Optional
+
+import pytest
+import requests
+from pytest_httpserver import HTTPServer
+from werkzeug.datastructures import Headers
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+from fixtures.log_helper import log
+
+
+class StorageControllerProxy:
+    def __init__(self, server: HTTPServer):
+        self.server: HTTPServer = server
+        self.listen: str = f"http://{server.host}:{server.port}"
+        self.routing_to: Optional[str] = None
+
+    def route_to(self, storage_controller_api: str):
+        self.routing_to = storage_controller_api
+
+    def port(self) -> int:
+        return self.server.port
+
+    def upcall_api_endpoint(self) -> str:
+        return f"{self.listen}/upcall/v1"
+
+
+def proxy_request(method: str, url: str, **kwargs) -> requests.Response:
+    return requests.request(method, url, **kwargs)
+
+
+@pytest.fixture(scope="function")
+def storage_controller_proxy(make_httpserver):
+    """
+    Proxies requests into the storage controller to the currently
+    selected storage controller instance via `StorageControllerProxy.route_to`.
+
+    This fixture is intended for tests that need to run multiple instances
+    of the storage controller at the same time.
+    """
+    server = make_httpserver
+
+    self = StorageControllerProxy(server)
+
+    log.info(f"Storage controller proxy listening on {self.listen}")
+
+    def handler(request: Request):
+        if self.route_to is None:
+            log.info(f"Storage controller proxy has no routing configured for {request.url}")
+            return Response("Routing not configured", status=503)
+
+        route_to_url = f"{self.routing_to}{request.path}"
+
+        log.info(f"Routing {request.url} to {route_to_url}")
+
+        args: dict[str, Any] = {"headers": request.headers}
+        if request.is_json:
+            args["json"] = request.json
+
+        response = proxy_request(request.method, route_to_url, **args)
+
+        headers = Headers()
+        for key, value in response.headers.items():
+            headers.add(key, value)
+
+        return Response(response.content, headers=headers, status=response.status_code)
+
+    self.server.expect_request(re.compile(".*")).respond_with_handler(handler)
+
+    yield self
+    server.clear()
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 4dc9f7caae..80f1c9e4e3 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -403,7 +403,7 @@ def wait_until(
         try:
             res = func()
         except Exception as e:
-            log.info("waiting for %s iteration %s failed", func, i + 1)
+            log.info("waiting for %s iteration %s failed: %s", func, i + 1, e)
             last_exception = e
             if show_intermediate_error:
                 log.info(e)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 7d98ff2923..95c35e9641 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import json
 import threading
 import time
@@ -16,6 +17,7 @@ from fixtures.neon_fixtures import (
     PageserverSchedulingPolicy,
     PgBin,
     StorageControllerApiException,
+    StorageControllerLeadershipStatus,
     TokenScope,
     last_flush_lsn_upload,
 )
@@ -30,7 +32,9 @@ from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
 )
 from fixtures.pg_version import PgVersion
+from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
+from fixtures.storage_controller_proxy import StorageControllerProxy
 from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
 from fixtures.workload import Workload
 from mypy_boto3_s3.type_defs import (
@@ -2093,6 +2097,131 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     )
 
 
+# This is a copy of NeonEnv.start which injects the instance id and port
+# into the call to NeonStorageController.start
+def start_env(env: NeonEnv, storage_controller_port: int):
+    timeout_in_seconds = 30
+
+    # Storage controller starts first, so that pageserver /re-attach calls don't
+    # bounce through retries on startup
+    env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port)
+
+    # Wait for storage controller readiness to prevent unnecessary post start-up
+    # reconcile.
+    env.storage_controller.wait_until_ready()
+
+    # Start up broker, pageserver and all safekeepers
+    futs = []
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
+    ) as executor:
+        futs.append(
+            executor.submit(lambda: env.broker.try_start() or None)
+        )  # The `or None` is for the linter
+
+        for pageserver in env.pageservers:
+            futs.append(
+                executor.submit(
+                    lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
+                )
+            )
+
+        for safekeeper in env.safekeepers:
+            futs.append(
+                executor.submit(
+                    lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
+                )
+            )
+
+    for f in futs:
+        f.result()
+
+
+@pytest.mark.parametrize("step_down_times_out", [False, True])
+def test_storage_controller_leadership_transfer(
+    neon_env_builder: NeonEnvBuilder,
+    storage_controller_proxy: StorageControllerProxy,
+    port_distributor: PortDistributor,
+    step_down_times_out: bool,
+):
+    neon_env_builder.num_pageservers = 3
+
+    neon_env_builder.storage_controller_config = {
+        "database_url": f"127.0.0.1:{port_distributor.get_port()}",
+        "start_as_candidate": True,
+    }
+
+    neon_env_builder.storage_controller_port_override = storage_controller_proxy.port()
+
+    storage_controller_1_port = port_distributor.get_port()
+    storage_controller_2_port = port_distributor.get_port()
+
+    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+
+    env = neon_env_builder.init_configs()
+    start_env(env, storage_controller_1_port)
+
+    assert (
+        env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER
+    )
+    leader = env.storage_controller.get_leader()
+    assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/"
+
+    if step_down_times_out:
+        env.storage_controller.configure_failpoints(
+            ("sleep-on-step-down-handling", "return(10000)")
+        )
+        env.storage_controller.allowed_errors.append(".*request was dropped before completing.*")
+
+    tenant_count = 2
+    shard_count = 4
+    tenants = set(TenantId.generate() for _ in range(0, tenant_count))
+
+    for tid in tenants:
+        env.storage_controller.tenant_create(
+            tid, shard_count=shard_count, placement_policy={"Attached": 1}
+        )
+    env.storage_controller.reconcile_until_idle()
+
+    env.storage_controller.start(
+        timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
+    )
+
+    if not step_down_times_out:
+
+        def previous_stepped_down():
+            assert (
+                env.storage_controller.get_leadership_status()
+                == StorageControllerLeadershipStatus.STEPPED_DOWN
+            )
+
+        wait_until(5, 1, previous_stepped_down)
+
+    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
+
+    def new_becomes_leader():
+        assert (
+            env.storage_controller.get_leadership_status()
+            == StorageControllerLeadershipStatus.LEADER
+        )
+
+    wait_until(15, 1, new_becomes_leader)
+    leader = env.storage_controller.get_leader()
+    assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/"
+
+    env.storage_controller.wait_until_ready()
+    env.storage_controller.consistency_check()
+
+    if step_down_times_out:
+        env.storage_controller.allowed_errors.extend(
+            [
+                ".*Leader.*did not respond to step-down request.*",
+                ".*Send step down request failed.*",
+                ".*Send step down request still failed.*",
+            ]
+        )
+
+
 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
     # single unsharded tenant, two locations
     neon_env_builder.num_pageservers = 2

From 25e7d321f474e5cbc5ac53ed42de697a48db50db Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 6 Aug 2024 12:51:05 +0300
Subject: [PATCH 57/62] safekeeper: cross check divergence point in
 ProposerElected handling.

Previously, we protected from multiple ProposerElected messages from the same
walproposer with the following condition:

msg.term == self.get_last_log_term() && self.flush_lsn() >
msg.start_streaming_at

It is not exhaustive, i.e. we could still proceed to truncating WAL even though
safekeeper inserted something since the divergence point has been
calculated. While it was most likely safe because walproposer can't use
safekeeper position to commit WAL until last_log_term reaches the current
walproposer term, let's be more careful and properly calculate the divergence
point like walproposer does.
---
 safekeeper/src/safekeeper.rs | 62 +++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 19 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 33ec39b852..0814d9ba67 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -92,7 +92,7 @@ impl TermHistory {
     }
 
     /// Find point of divergence between leader (walproposer) term history and
-    /// safekeeper. Arguments are not symmetrics as proposer history ends at
+    /// safekeeper. Arguments are not symmetric as proposer history ends at
     /// +infinity while safekeeper at flush_lsn.
     /// C version is at walproposer SendProposerElected.
     pub fn find_highest_common_point(
@@ -701,7 +701,13 @@ where
             .with_label_values(&["handle_elected"])
             .start_timer();
 
-        info!("received ProposerElected {:?}", msg);
+        info!(
+            "received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}",
+            msg,
+            self.state.acceptor_state.term,
+            self.get_last_log_term(),
+            self.flush_lsn()
+        );
         if self.state.acceptor_state.term < msg.term {
             let mut state = self.state.start_change();
             state.acceptor_state.term = msg.term;
@@ -713,22 +719,43 @@ where
             return Ok(None);
         }
 
-        // This might happen in a rare race when another (old) connection from
-        // the same walproposer writes + flushes WAL after this connection
-        // already sent flush_lsn in VoteRequest. It is generally safe to
-        // proceed, but to prevent commit_lsn surprisingly going down we should
-        // either refuse the session (simpler) or skip the part we already have
-        // from the stream (can be implemented).
-        if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
-            bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
-                   msg.term, self.flush_lsn(), msg.start_streaming_at)
+        // Before truncating WAL check-cross the check divergence point received
+        // from the walproposer.
+        let sk_th = self.get_term_history();
+        let last_common_point = match TermHistory::find_highest_common_point(
+            &msg.term_history,
+            &sk_th,
+            self.flush_lsn(),
+        ) {
+            // No common point. Expect streaming from the beginning of the
+            // history like walproposer while we don't have proper init.
+            None => *msg.term_history.0.first().ok_or(anyhow::anyhow!(
+                "empty walproposer term history {:?}",
+                msg.term_history
+            ))?,
+            Some(lcp) => lcp,
+        };
+        // This is expected to happen in a rare race when another connection
+        // from the same walproposer writes + flushes WAL after this connection
+        // sent flush_lsn in VoteRequest; for instance, very late
+        // ProposerElected message delivery after another connection was
+        // established and wrote WAL. In such cases error is transient;
+        // reconnection makes safekeeper send newest term history and flush_lsn
+        // and walproposer recalculates the streaming point. OTOH repeating
+        // error indicates a serious bug.
+        if last_common_point.lsn != msg.start_streaming_at {
+            bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
+                    last_common_point, msg.start_streaming_at,
+                    self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
+            );
         }
-        // Otherwise we must never attempt to truncate committed data.
+
+        // We are also expected to never attempt to truncate committed data.
         assert!(
             msg.start_streaming_at >= self.state.inmem.commit_lsn,
-            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
-            msg.start_streaming_at,
-            self.state.inmem.commit_lsn
+            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
+            msg.start_streaming_at, self.state.inmem.commit_lsn,
+            self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
         );
 
         // Before first WAL write initialize its segment. It makes first segment
@@ -743,9 +770,6 @@ where
                 .await?;
         }
 
-        // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to
-        // intersection of our history and history from msg
-
         // truncate wal, update the LSNs
         self.wal_store.truncate_wal(msg.start_streaming_at).await?;
 
@@ -1069,7 +1093,7 @@ mod tests {
 
         let pem = ProposerElected {
             term: 1,
-            start_streaming_at: Lsn(1),
+            start_streaming_at: Lsn(3),
             term_history: TermHistory(vec![TermLsn {
                 term: 1,
                 lsn: Lsn(3),

From e2d89f7991bc9cea88661e50722a02346b7b6485 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 16 Aug 2024 13:35:02 +0100
Subject: [PATCH 58/62] pageserver: prioritize secondary downloads to get most
 recent layers first, except l0s (#8729)

## Problem

When a secondary location is trying to catch up while a tenant is
receiving new writes, it can become quite wasteful:
- Downloading L0s which are soon destroyed by compaction to L1s
- Downloading older layer files which are soon made irrelevant when
covered by image layers.

## Summary of changes

Sort the layer files in the heatmap:
- L0 layers are the lowest priority
- Other layers are sorted to download the highest LSNs first.
---
 pageserver/src/tenant/secondary/heatmap.rs |   8 +-
 pageserver/src/tenant/timeline.rs          | 134 +++++++++++++++++++--
 2 files changed, 130 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index 166483ba5d..4a8e66d38a 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapTimeline {
     #[serde_as(as = "DisplayFromStr")]
-    pub(super) timeline_id: TimelineId,
+    pub(crate) timeline_id: TimelineId,
 
-    pub(super) layers: Vec<HeatMapLayer>,
+    pub(crate) layers: Vec<HeatMapLayer>,
 }
 
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
-    pub(super) name: LayerName,
-    pub(super) metadata: LayerFileMetadata,
+    pub(crate) name: LayerName,
+    pub(crate) metadata: LayerFileMetadata,
 
     #[serde_as(as = "TimestampSeconds<i64>")]
     pub(super) access_time: SystemTime,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 01e77fa1b1..26dc87c373 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2977,11 +2977,7 @@ impl Timeline {
                 LayerVisibilityHint::Visible => {
                     // Layer is visible to one or more read LSNs: elegible for inclusion in layer map
                     let last_activity_ts = layer.latest_activity();
-                    Some(HeatMapLayer::new(
-                        layer.layer_desc().layer_name(),
-                        layer.metadata(),
-                        last_activity_ts,
-                    ))
+                    Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
                 }
                 LayerVisibilityHint::Covered => {
                     // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -2990,7 +2986,23 @@ impl Timeline {
             }
         });
 
-        let layers = resident.collect();
+        let mut layers = resident.collect::<Vec<_>>();
+
+        // Sort layers in order of which to download first.  For a large set of layers to download, we
+        // want to prioritize those layers which are most likely to still be in the resident many minutes
+        // or hours later:
+        // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
+        //   only exist for a few minutes before being compacted into L1s.
+        // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
+        //   the layer is likely to be covered by an image layer during compaction.
+        layers.sort_by_key(|(desc, _meta, _atime)| {
+            std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
+        });
+
+        let layers = layers
+            .into_iter()
+            .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
+            .collect();
 
         Some(HeatMapTimeline::new(self.timeline_id, layers))
     }
@@ -4516,6 +4528,7 @@ impl DurationRecorder {
 /// the layer descriptor requires the user to provide the ranges, which should cover all
 /// keys specified in the `data` field.
 #[cfg(test)]
+#[derive(Clone)]
 pub struct DeltaLayerTestDesc {
     pub lsn_range: Range<Lsn>,
     pub key_range: Range<Key>,
@@ -4545,6 +4558,13 @@ impl DeltaLayerTestDesc {
             data,
         }
     }
+
+    pub(crate) fn layer_name(&self) -> LayerName {
+        LayerName::Delta(super::storage_layer::DeltaLayerName {
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
+        })
+    }
 }
 
 impl Timeline {
@@ -5768,12 +5788,110 @@ fn is_send() {
 
 #[cfg(test)]
 mod tests {
+    use pageserver_api::key::Key;
     use utils::{id::TimelineId, lsn::Lsn};
 
-    use crate::tenant::{
-        harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
+    use crate::{
+        repository::Value,
+        tenant::{
+            harness::{test_img, TenantHarness},
+            layer_map::LayerMap,
+            storage_layer::{Layer, LayerName},
+            timeline::{DeltaLayerTestDesc, EvictionError},
+            Timeline,
+        },
     };
 
+    #[tokio::test]
+    async fn test_heatmap_generation() {
+        let harness = TenantHarness::create("heatmap_generation").await.unwrap();
+
+        let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x20),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                Lsn(0x11),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x20),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x11),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let l0_delta = DeltaLayerTestDesc::new(
+            Lsn(0x20)..Lsn(0x30),
+            Key::from_hex("000000000000000000000000000000000000").unwrap()
+                ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x25),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let delta_layers = vec![
+            covered_delta.clone(),
+            visible_delta.clone(),
+            l0_delta.clone(),
+        ];
+
+        let image_layer = (
+            Lsn(0x40),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                test_img("bar"),
+            )],
+        );
+        let image_layers = vec![image_layer];
+
+        let (tenant, ctx) = harness.load().await;
+        let timeline = tenant
+            .create_test_timeline_with_layers(
+                TimelineId::generate(),
+                Lsn(0x10),
+                14,
+                &ctx,
+                delta_layers,
+                image_layers,
+                Lsn(0x100),
+            )
+            .await
+            .unwrap();
+
+        // Layer visibility is an input to heatmap generation, so refresh it first
+        timeline.update_layer_visibility().await.unwrap();
+
+        let heatmap = timeline
+            .generate_heatmap()
+            .await
+            .expect("Infallible while timeline is not shut down");
+
+        assert_eq!(heatmap.timeline_id, timeline.timeline_id);
+
+        // L0 should come last
+        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
+
+        let mut last_lsn = Lsn::MAX;
+        for layer in heatmap.layers {
+            // Covered layer should be omitted
+            assert!(layer.name != covered_delta.layer_name());
+
+            let layer_lsn = match &layer.name {
+                LayerName::Delta(d) => d.lsn_range.end,
+                LayerName::Image(i) => i.lsn,
+            };
+
+            // Apart from L0s, newest Layers should come first
+            if !LayerMap::is_l0(layer.name.key_range()) {
+                assert!(layer_lsn <= last_lsn);
+                last_lsn = layer_lsn;
+            }
+        }
+    }
+
     #[tokio::test]
     async fn two_layer_eviction_attempts_at_the_same_time() {
         let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")

From c6b6b7700a31dc945276ccd091d33373548f518c Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Fri, 16 Aug 2024 11:13:18 -0700
Subject: [PATCH 59/62] Fix superuser check in test_snap_files (#8749)

## Problem
Current superuser check always passes because it returns a tuple like
`(False,)`, and then the `if not superuser` passes.

## Summary of changes
Fixes the issue by unwrapping the tuple. Verified that it works against
a project where I don't have superuser.
---
 test_runner/performance/test_logical_replication.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index c4e42a7834..077f73ac06 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -282,15 +282,16 @@ def test_snap_files(
 
     env = benchmark_project_pub.pgbench_env
     connstr = benchmark_project_pub.connstr
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
 
     with psycopg2.connect(connstr) as conn:
         conn.autocommit = True
         with conn.cursor() as cur:
             cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
-            is_super = cur.fetchall()[0]
+            is_super = cur.fetchall()[0][0]
             assert is_super, "This benchmark won't work if we don't have superuser"
 
+    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
+
     conn = psycopg2.connect(connstr)
     conn.autocommit = True
     cur = conn.cursor()

From 2be69af6c3a595c90f747dabe44fe898b59375c9 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Fri, 16 Aug 2024 22:19:44 +0300
Subject: [PATCH 60/62] Track holes to be able to reuse them once LFC limit is
 increased (#8575)

## Problem

Multiple increase/decrease LFC limit may cause unlimited growth of LFC
file because punched holes while LFC shrinking are not reused when LFC
is extended.

## Summary of changes

Keep track of holes and reused them when LFC size is increased.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/file_cache.c                 | 117 ++++++++++++++++++++-----
 pgxn/neon/neon_pgversioncompat.h       |   4 +
 test_runner/regress/test_lfc_resize.py |  28 ++++--
 3 files changed, 119 insertions(+), 30 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 1894e8c72a..479209a537 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -41,6 +41,8 @@
 
 #include "hll.h"
 
+#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
+
 /*
  * Local file cache is used to temporary store relations pages in local file system.
  * All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -51,19 +53,43 @@
  *
  * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
  * its consistency.
+
+ *
+ * ## Holes
+ *
+ * The LFC can be resized on the fly, up to a maximum size that's determined
+ * at server startup (neon.max_file_cache_size). After server startup, we
+ * expand the underlying file when needed, until it reaches the soft limit
+ * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
+ * the LFC by punching holes in the underlying file with a
+ * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
+ * shrink, but the disk space it uses does.
+ *
+ * Each hole is tracked by a dummy FileCacheEntry, which are kept in the
+ * 'holes' linked list. They are entered into the chunk hash table, with a
+ * special key where the blockNumber is used to store the 'offset' of the
+ * hole, and all other fields are zero. Holes are never looked up in the hash
+ * table, we only enter them there to have a FileCacheEntry that we can keep
+ * in the linked list. If the soft limit is raised again, we reuse the holes
+ * before extending the nominal size of the file.
  */
 
 /* Local file storage allocation chunk.
- * Should be power of two and not less than 32. Using larger than page chunks can
+ * Should be power of two. Using larger than page chunks can
  * 1. Reduce hash-map memory footprint: 8TB database contains billion pages
  *    and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
  *    1Mb chunks can reduce hash map size to 320Mb.
  * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
  */
 #define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
+/*
+ * Smaller chunk seems to be better for OLTP workload
+ */
+// #define BLOCKS_PER_CHUNK	8 /* 64kb chunk */
 #define MB					((uint64)1024*1024)
 
 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
+#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32)
 
 typedef struct FileCacheEntry
 {
@@ -71,8 +97,8 @@ typedef struct FileCacheEntry
 	uint32		hash;
 	uint32		offset;
 	uint32		access_count;
-	uint32		bitmap[BLOCKS_PER_CHUNK / 32];
-	dlist_node	lru_node;		/* LRU list node */
+	uint32		bitmap[CHUNK_BITMAP_SIZE];
+	dlist_node	list_node;		/* LRU/holes list node */
 } FileCacheEntry;
 
 typedef struct FileCacheControl
@@ -87,6 +113,7 @@ typedef struct FileCacheControl
 	uint64		writes;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
+	dlist_head  holes;          /* double linked list of punched holes */
 	HyperLogLogState wss_estimation; /* estimation of working set size */
 } FileCacheControl;
 
@@ -135,6 +162,7 @@ lfc_disable(char const *op)
 		lfc_ctl->used = 0;
 		lfc_ctl->limit = 0;
 		dlist_init(&lfc_ctl->lru);
+		dlist_init(&lfc_ctl->holes);
 
 		if (lfc_desc > 0)
 		{
@@ -214,18 +242,18 @@ lfc_shmem_startup(void)
 	if (!found)
 	{
 		int			fd;
-		uint32		lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+		uint32		n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
 
 		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);
 
 		/*
-		 * lfc_size+1 because we add new element to hash table before eviction
+		 * n_chunks+1 because we add new element to hash table before eviction
 		 * of victim
 		 */
 		lfc_hash = ShmemInitHash("lfc_hash",
-								 lfc_size + 1, lfc_size + 1,
+								 n_chunks + 1, n_chunks + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->generation = 0;
@@ -235,6 +263,7 @@ lfc_shmem_startup(void)
 		lfc_ctl->misses = 0;
 		lfc_ctl->writes = 0;
 		dlist_init(&lfc_ctl->lru);
+		dlist_init(&lfc_ctl->holes);
 
 		/* Initialize hyper-log-log structure for estimating working set size */
 		initSHLL(&lfc_ctl->wss_estimation);
@@ -310,14 +339,31 @@ lfc_change_limit_hook(int newval, void *extra)
 		 * Shrink cache by throwing away least recently accessed chunks and
 		 * returning their space to file system
 		 */
-		FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+		FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
+		FileCacheEntry *hole;
+		uint32		offset = victim->offset;
+		uint32		hash;
+		bool		found;
+		BufferTag	holetag;
 
-		Assert(victim->access_count == 0);
+		CriticalAssert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
 		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
 			neon_log(LOG, "Failed to punch hole in file: %m");
 #endif
+		/* We remove the old entry, and re-enter a hole to the hash table */
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
+
+		memset(&holetag, 0, sizeof(holetag));
+		holetag.blockNum = offset;
+		hash = get_hash_value(lfc_hash, &holetag);
+		hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
+		hole->hash = hash;
+		hole->offset = offset;
+		hole->access_count = 0;
+		CriticalAssert(!found);
+		dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
+
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
@@ -409,6 +455,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -440,6 +488,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	tag.forkNum = forkNum;
 	tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));
 
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -470,7 +519,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	{
 		bool		has_remaining_pages;
 
-		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
+		for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
 		{
 			if (entry->bitmap[i] != 0)
 			{
@@ -485,8 +534,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 		 */
 		if (!has_remaining_pages)
 		{
-			dlist_delete(&entry->lru_node);
-			dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
+			dlist_delete(&entry->list_node);
+			dlist_push_head(&lfc_ctl->lru, &entry->list_node);
 		}
 	}
 
@@ -525,6 +574,8 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -551,7 +602,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	}
 	/* Unlink entry from LRU list to pin it for the duration of IO operation */
 	if (entry->access_count++ == 0)
-		dlist_delete(&entry->lru_node);
+		dlist_delete(&entry->list_node);
 	generation = lfc_ctl->generation;
 	entry_offset = entry->offset;
 
@@ -569,12 +620,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 	if (lfc_ctl->generation == generation)
 	{
-		Assert(LFC_ENABLED());
+		CriticalAssert(LFC_ENABLED());
 		lfc_ctl->hits += 1;
 		pgBufferUsage.file_cache.hits += 1;
-		Assert(entry->access_count > 0);
+		CriticalAssert(entry->access_count > 0);
 		if (--entry->access_count == 0)
-			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+			dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
 	}
 	else
 		result = false;
@@ -613,6 +664,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -632,7 +685,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		 * operation
 		 */
 		if (entry->access_count++ == 0)
-			dlist_delete(&entry->lru_node);
+			dlist_delete(&entry->list_node);
 	}
 	else
 	{
@@ -655,13 +708,26 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+			FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
 
-			Assert(victim->access_count == 0);
+			CriticalAssert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
 			neon_log(DEBUG2, "Swap file cache page");
 		}
+		else if (!dlist_is_empty(&lfc_ctl->holes))
+		{
+			/* We can reuse a hole that was left behind when the LFC was shrunk previously */
+			FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
+			uint32		offset = hole->offset;
+			bool		found;
+
+			hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
+			CriticalAssert(found);
+
+			lfc_ctl->used += 1;
+			entry->offset = offset;	/* reuse the hole */
+		}
 		else
 		{
 			lfc_ctl->used += 1;
@@ -689,11 +755,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 
 		if (lfc_ctl->generation == generation)
 		{
-			Assert(LFC_ENABLED());
+			CriticalAssert(LFC_ENABLED());
 			/* Place entry to the head of LRU list */
-			Assert(entry->access_count > 0);
+			CriticalAssert(entry->access_count > 0);
 			if (--entry->access_count == 0)
-				dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
 
 			entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
 		}
@@ -708,7 +774,6 @@ typedef struct
 } NeonGetStatsCtx;
 
 #define NUM_NEON_GET_STATS_COLS	2
-#define NUM_NEON_GET_STATS_ROWS	3
 
 PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
 Datum
@@ -744,7 +809,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 						   INT8OID, -1, 0);
 
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
-		funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
 		funcctx->user_fctx = fctx;
 
 		/* Return to original context when allocating transient memory */
@@ -778,6 +842,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 			if (lfc_ctl)
 				value = lfc_ctl->writes;
 			break;
+		case 4:
+			key = "file_cache_size";
+			if (lfc_ctl)
+				value = lfc_ctl->size;
+			break;
 		default:
 			SRF_RETURN_DONE(funcctx);
 	}
@@ -901,7 +970,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
+					for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
 						n_pages += pg_popcount32(entry->bitmap[i]);
 				}
 			}
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index f19732cbbb..addb6ccce6 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -54,6 +54,10 @@
 
 #define BufTagGetNRelFileInfo(tag) tag.rnode
 
+#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
+
+#define InvalidRelFileNumber InvalidOid
+
 #define SMgrRelGetRelInfo(reln) \
 	(reln->smgr_rnode.node)
 
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py
index 2a3442448a..1b2c7f808f 100644
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -1,3 +1,7 @@
+import os
+import random
+import re
+import subprocess
 import threading
 import time
 
@@ -17,17 +21,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
         "test_lfc_resize",
         config_lines=[
             "neon.file_cache_path='file.cache'",
-            "neon.max_file_cache_size=1GB",
-            "neon.file_cache_size_limit=1GB",
+            "neon.max_file_cache_size=512MB",
+            "neon.file_cache_size_limit=512MB",
         ],
     )
     n_resize = 10
-    scale = 10
+    scale = 100
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
         pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
-        pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])
+        pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr])
 
     thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
     thread.start()
@@ -35,9 +39,21 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
     conn = endpoint.connect()
     cur = conn.cursor()
 
-    for i in range(n_resize):
-        cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
+    for _ in range(n_resize):
+        size = random.randint(1, 512)
+        cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'")
         cur.execute("select pg_reload_conf()")
         time.sleep(1)
 
+    cur.execute("alter system set neon.file_cache_size_limit='100MB'")
+    cur.execute("select pg_reload_conf()")
+
     thread.join()
+
+    lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
+    lfc_file_size = os.path.getsize(lfc_file_path)
+    res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True)
+    lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
+    log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
+    assert lfc_file_size <= 512 * 1024 * 1024
+    assert int(lfc_file_blocks) <= 128 * 1024

From 7131ac4730f7268a8624a9c7345c23938cc8b6a6 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Sat, 17 Aug 2024 11:48:53 -0400
Subject: [PATCH 61/62] refactor(scrubber): add unified command suitable for
 cron job (#8635)

Part of #8128.

## Description

This PR creates a unified command to run both physical gc and metadata
health check as a cron job. This also enables us to add additional tasks
to the cron job in the future.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 storage_scrubber/src/main.rs                  | 240 ++++++++++++------
 .../src/pageserver_physical_gc.rs             |  16 +-
 .../src/scan_pageserver_metadata.rs           |   2 +-
 3 files changed, 175 insertions(+), 83 deletions(-)

diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index cbc836755a..3935e513e3 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -3,9 +3,10 @@ use camino::Utf8PathBuf;
 use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
 use reqwest::{Method, Url};
+use storage_controller_client::control_api;
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
-use storage_scrubber::scan_pageserver_metadata::scan_metadata;
+use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata;
 use storage_scrubber::tenant_snapshot::SnapshotDownloader;
 use storage_scrubber::{find_large_objects, ControllerClientConfig};
 use storage_scrubber::{
@@ -68,7 +69,7 @@ enum Command {
         #[arg(long = "tenant-id", num_args = 0..)]
         tenant_ids: Vec<TenantShardId>,
         #[arg(long = "post", default_value_t = false)]
-        post_to_storage_controller: bool,
+        post_to_storcon: bool,
         #[arg(long, default_value = None)]
         /// For safekeeper node_kind only, points to db with debug dump
         dump_db_connstr: Option<String>,
@@ -100,6 +101,16 @@ enum Command {
         #[arg(long = "concurrency", short = 'j', default_value_t = 64)]
         concurrency: usize,
     },
+    CronJob {
+        // PageserverPhysicalGc
+        #[arg(long = "min-age")]
+        gc_min_age: humantime::Duration,
+        #[arg(short, long, default_value_t = GcMode::IndicesOnly)]
+        gc_mode: GcMode,
+        // ScanMetadata
+        #[arg(long = "post", default_value_t = false)]
+        post_to_storcon: bool,
+    },
 }
 
 #[tokio::main]
@@ -117,6 +128,7 @@ async fn main() -> anyhow::Result<()> {
         Command::TenantSnapshot { .. } => "tenant-snapshot",
         Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc",
         Command::FindLargeObjects { .. } => "find-large-objects",
+        Command::CronJob { .. } => "cron-job",
     };
     let _guard = init_logging(&format!(
         "{}_{}_{}_{}.log",
@@ -126,12 +138,13 @@ async fn main() -> anyhow::Result<()> {
         chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
     ));
 
-    let controller_client_conf = cli.controller_api.map(|controller_api| {
+    let controller_client = cli.controller_api.map(|controller_api| {
         ControllerClientConfig {
             controller_api,
             // Default to no key: this is a convenience when working in a development environment
             controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
         }
+        .build_client()
     });
 
     match cli.command {
@@ -139,7 +152,7 @@ async fn main() -> anyhow::Result<()> {
             json,
             tenant_ids,
             node_kind,
-            post_to_storage_controller,
+            post_to_storcon,
             dump_db_connstr,
             dump_db_table,
         } => {
@@ -178,53 +191,14 @@ async fn main() -> anyhow::Result<()> {
                 }
                 Ok(())
             } else {
-                if controller_client_conf.is_none() && post_to_storage_controller {
-                    return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
-                }
-                match scan_metadata(bucket_config.clone(), tenant_ids).await {
-                    Err(e) => {
-                        tracing::error!("Failed: {e}");
-                        Err(e)
-                    }
-                    Ok(summary) => {
-                        if json {
-                            println!("{}", serde_json::to_string(&summary).unwrap())
-                        } else {
-                            println!("{}", summary.summary_string());
-                        }
-
-                        if post_to_storage_controller {
-                            if let Some(conf) = controller_client_conf {
-                                let controller_client = conf.build_client();
-                                let body = summary.build_health_update_request();
-                                controller_client
-                                    .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
-                                        Method::POST,
-                                        "control/v1/metadata_health/update".to_string(),
-                                        Some(body),
-                                    )
-                                    .await?;
-                            }
-                        }
-
-                        if summary.is_fatal() {
-                            tracing::error!("Fatal scrub errors detected");
-                        } else if summary.is_empty() {
-                            // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                            // scrubber they were likely expecting to scan something, and if we see no timelines
-                            // at all then it's likely due to some configuration issues like a bad prefix
-                            tracing::error!(
-                                "No timelines found in bucket {} prefix {}",
-                                bucket_config.bucket,
-                                bucket_config
-                                    .prefix_in_bucket
-                                    .unwrap_or("<none>".to_string())
-                            );
-                        }
-
-                        Ok(())
-                    }
-                }
+                scan_pageserver_metadata_cmd(
+                    bucket_config,
+                    controller_client.as_ref(),
+                    tenant_ids,
+                    json,
+                    post_to_storcon,
+                )
+                .await
             }
         }
         Command::FindGarbage {
@@ -254,31 +228,14 @@ async fn main() -> anyhow::Result<()> {
             min_age,
             mode,
         } => {
-            match (&controller_client_conf, mode) {
-                (Some(_), _) => {
-                    // Any mode may run when controller API is set
-                }
-                (None, GcMode::Full) => {
-                    // The part of physical GC where we erase ancestor layers cannot be done safely without
-                    // confirming the most recent complete shard split with the controller.  Refuse to run, rather
-                    // than doing it unsafely.
-                    return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run"));
-                }
-                (None, GcMode::DryRun | GcMode::IndicesOnly) => {
-                    // These GcModes do not require the controller to run.
-                }
-            }
-
-            let summary = pageserver_physical_gc(
-                bucket_config,
-                controller_client_conf,
+            pageserver_physical_gc_cmd(
+                &bucket_config,
+                controller_client.as_ref(),
                 tenant_ids,
-                min_age.into(),
+                min_age,
                 mode,
             )
-            .await?;
-            println!("{}", serde_json::to_string(&summary).unwrap());
-            Ok(())
+            .await
         }
         Command::FindLargeObjects {
             min_size,
@@ -295,5 +252,142 @@ async fn main() -> anyhow::Result<()> {
             println!("{}", serde_json::to_string(&summary).unwrap());
             Ok(())
         }
+        Command::CronJob {
+            gc_min_age,
+            gc_mode,
+            post_to_storcon,
+        } => {
+            run_cron_job(
+                bucket_config,
+                controller_client.as_ref(),
+                gc_min_age,
+                gc_mode,
+                post_to_storcon,
+            )
+            .await
+        }
+    }
+}
+
+/// Runs the scrubber cron job.
+/// 1. Do pageserver physical gc
+/// 2. Scan pageserver metadata
+pub async fn run_cron_job(
+    bucket_config: BucketConfig,
+    controller_client: Option<&control_api::Client>,
+    gc_min_age: humantime::Duration,
+    gc_mode: GcMode,
+    post_to_storcon: bool,
+) -> anyhow::Result<()> {
+    tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc");
+    pageserver_physical_gc_cmd(
+        &bucket_config,
+        controller_client,
+        Vec::new(),
+        gc_min_age,
+        gc_mode,
+    )
+    .await?;
+    tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata");
+    scan_pageserver_metadata_cmd(
+        bucket_config,
+        controller_client,
+        Vec::new(),
+        true,
+        post_to_storcon,
+    )
+    .await?;
+
+    Ok(())
+}
+
+pub async fn pageserver_physical_gc_cmd(
+    bucket_config: &BucketConfig,
+    controller_client: Option<&control_api::Client>,
+    tenant_shard_ids: Vec<TenantShardId>,
+    min_age: humantime::Duration,
+    mode: GcMode,
+) -> anyhow::Result<()> {
+    match (controller_client, mode) {
+        (Some(_), _) => {
+            // Any mode may run when controller API is set
+        }
+        (None, GcMode::Full) => {
+            // The part of physical GC where we erase ancestor layers cannot be done safely without
+            // confirming the most recent complete shard split with the controller.  Refuse to run, rather
+            // than doing it unsafely.
+            return Err(anyhow!(
+                "Full physical GC requires `--controller-api` and `--controller-jwt` to run"
+            ));
+        }
+        (None, GcMode::DryRun | GcMode::IndicesOnly) => {
+            // These GcModes do not require the controller to run.
+        }
+    }
+
+    let summary = pageserver_physical_gc(
+        bucket_config,
+        controller_client,
+        tenant_shard_ids,
+        min_age.into(),
+        mode,
+    )
+    .await?;
+    println!("{}", serde_json::to_string(&summary).unwrap());
+    Ok(())
+}
+
+pub async fn scan_pageserver_metadata_cmd(
+    bucket_config: BucketConfig,
+    controller_client: Option<&control_api::Client>,
+    tenant_shard_ids: Vec<TenantShardId>,
+    json: bool,
+    post_to_storcon: bool,
+) -> anyhow::Result<()> {
+    if controller_client.is_none() && post_to_storcon {
+        return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
+    }
+    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
+        Err(e) => {
+            tracing::error!("Failed: {e}");
+            Err(e)
+        }
+        Ok(summary) => {
+            if json {
+                println!("{}", serde_json::to_string(&summary).unwrap())
+            } else {
+                println!("{}", summary.summary_string());
+            }
+
+            if post_to_storcon {
+                if let Some(client) = controller_client {
+                    let body = summary.build_health_update_request();
+                    client
+                        .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
+                            Method::POST,
+                            "control/v1/metadata_health/update".to_string(),
+                            Some(body),
+                        )
+                        .await?;
+                }
+            }
+
+            if summary.is_fatal() {
+                tracing::error!("Fatal scrub errors detected");
+            } else if summary.is_empty() {
+                // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                // scrubber they were likely expecting to scan something, and if we see no timelines
+                // at all then it's likely due to some configuration issues like a bad prefix
+                tracing::error!(
+                    "No timelines found in bucket {} prefix {}",
+                    bucket_config.bucket,
+                    bucket_config
+                        .prefix_in_bucket
+                        .unwrap_or("<none>".to_string())
+                );
+            }
+
+            Ok(())
+        }
     }
 }
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index c8b1ed49f4..20d9bd6dd4 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -4,9 +4,7 @@ use std::time::{Duration, SystemTime};
 
 use crate::checks::{list_timeline_blobs, BlobDataParseResult};
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{
-    init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId,
-};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -473,8 +471,8 @@ async fn gc_ancestor(
 /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
 /// make sure that object listings don't get slowed down by large numbers of garbage objects.
 pub async fn pageserver_physical_gc(
-    bucket_config: BucketConfig,
-    controller_client_conf: Option<ControllerClientConfig>,
+    bucket_config: &BucketConfig,
+    controller_client: Option<&control_api::Client>,
     tenant_shard_ids: Vec<TenantShardId>,
     min_age: Duration,
     mode: GcMode,
@@ -558,7 +556,7 @@ pub async fn pageserver_physical_gc(
         let timelines = timelines.map_ok(|ttid| {
             gc_timeline(
                 &s3_client,
-                &bucket_config,
+                bucket_config,
                 &min_age,
                 &target,
                 mode,
@@ -574,7 +572,7 @@ pub async fn pageserver_physical_gc(
     }
 
     // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
-    let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
+    let Some(client) = controller_client else {
         tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
         return Ok(summary);
     };
@@ -583,13 +581,13 @@ pub async fn pageserver_physical_gc(
         .unwrap()
         .into_inner()
         .unwrap()
-        .into_gc_ancestors(&controller_client, &mut summary)
+        .into_gc_ancestors(client, &mut summary)
         .await;
 
     for ancestor_shard in ancestor_shards {
         gc_ancestor(
             &s3_client,
-            &bucket_config,
+            bucket_config,
             &target,
             &min_age,
             ancestor_shard,
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index b9630056e1..2409b7b132 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -116,7 +116,7 @@ Index versions: {version_summary}
 }
 
 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
-pub async fn scan_metadata(
+pub async fn scan_pageserver_metadata(
     bucket_config: BucketConfig,
     tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {

From 188bde7f0776636310260cbf636922d1029add7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sun, 18 Aug 2024 19:32:10 +0200
Subject: [PATCH 62/62] Default image compression to zstd at level 1 (#8677)

After the rollout has succeeded, we now set the default image
compression to be enabled.

We also remove its explicit mention from `neon_fixtures.py` added in
#8368 as it is now the default (and we switch to `zstd(1)` which is a
bit nicer on CPU time).

Part of https://github.com/neondatabase/neon/issues/5431
---
 pageserver/src/config.rs                            | 12 +++++-------
 pageserver/src/tenant/storage_layer/split_writer.rs |  7 ++++++-
 test_runner/fixtures/neon_fixtures.py               |  1 -
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 3ac5ac539f..0ebaf78840 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -50,7 +50,6 @@ pub mod defaults {
         DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
         DEFAULT_PG_LISTEN_PORT,
     };
-    use pageserver_api::models::ImageCompressionAlgorithm;
     pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
 
     pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
@@ -90,8 +89,7 @@ pub mod defaults {
 
     pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
 
-    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
-        ImageCompressionAlgorithm::Disabled;
+    pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
 
     pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
 
@@ -478,7 +476,7 @@ impl PageServerConfigBuilder {
             max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                 NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
             )),
-            image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
+            image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
             ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: Set(L0FlushConfig::default()),
             compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
@@ -1065,7 +1063,7 @@ impl PageServerConf {
                 NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                     .expect("Invalid default constant"),
             ),
-            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
+            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
             ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
             l0_flush: L0FlushConfig::default(),
             compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1305,7 +1303,7 @@ background_task_maximum_delay = '334 s'
                     NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                         .expect("Invalid default constant")
                 ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                 l0_flush: L0FlushConfig::default(),
                 compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1378,7 +1376,7 @@ background_task_maximum_delay = '334 s'
                     NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                         .expect("Invalid default constant")
                 ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                 l0_flush: L0FlushConfig::default(),
                 compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index d7bfe48c60..e12e29cd45 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -208,6 +208,8 @@ impl SplitDeltaLayerWriter {
 
 #[cfg(test)]
 mod tests {
+    use rand::{RngCore, SeedableRng};
+
     use crate::{
         tenant::{
             harness::{TenantHarness, TIMELINE_ID},
@@ -229,7 +231,10 @@ mod tests {
     }
 
     fn get_large_img() -> Bytes {
-        vec![0; 8192].into()
+        let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
+        let mut data = vec![0; 8192];
+        rng.fill_bytes(&mut data);
+        data.into()
     }
 
     #[tokio::test]
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ec5a83601e..ba6fbc003a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1162,7 +1162,6 @@ class NeonEnv:
                 "listen_http_addr": f"localhost:{pageserver_port.http}",
                 "pg_auth_type": pg_auth_type,
                 "http_auth_type": http_auth_type,
-                "image_compression": "zstd",
             }
             if self.pageserver_virtual_file_io_engine is not None:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine